From b8f6341fbb6507b029c29a3b6c3926b62e0671b3 Mon Sep 17 00:00:00 2001 From: lilin Date: Tue, 11 Oct 2022 16:35:51 +0800 Subject: [PATCH 1/8] [feat] support MViT --- configs/_base_/models/mvit_small.py | 20 + configs/recognition/mvit/README.md | 77 ++ .../mvit-base-p244_32x3x1_kinetics400-rgb.py | 138 +++ .../mvit/mvit-base-p244_u32_sthv2-rgb.py | 141 +++ .../mvit-large-p244_40x3x1_kinetics400-rgb.py | 141 +++ .../mvit/mvit-large-p244_u40_sthv2-rgb.py | 143 +++ .../mvit-small-p244_16x4x1_kinetics400-rgb.py | 132 +++ .../mvit/mvit-small-p244_u16_sthv2-rgb.py | 135 +++ mmaction/datasets/transforms/__init__.py | 4 +- mmaction/datasets/transforms/loading.py | 250 ++++++ mmaction/datasets/transforms/pose_loading.py | 135 --- mmaction/datasets/transforms/processing.py | 170 ++++ mmaction/models/backbones/__init__.py | 4 +- mmaction/models/backbones/mvit.py | 850 ++++++++++++++++++ mmaction/models/heads/__init__.py | 4 +- mmaction/models/heads/mvit_head.py | 71 ++ mmaction/models/recognizers/recognizer3d.py | 25 +- mmaction/models/utils/__init__.py | 7 +- mmaction/models/utils/blending_utils.py | 68 ++ mmaction/models/utils/embed.py | 234 +++++ .../datasets/transforms/test_pose_loading.py | 88 +- tests/datasets/transforms/test_sampling.py | 87 +- tests/models/backbones/test_mvit.py | 134 +++ tests/models/utils/test_blending_utils.py | 42 +- 24 files changed, 2860 insertions(+), 240 deletions(-) create mode 100644 configs/_base_/models/mvit_small.py create mode 100644 configs/recognition/mvit/README.md create mode 100644 configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py create mode 100644 configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py create mode 100644 configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py create mode 100644 configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py create mode 100644 configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py create mode 100644 configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py create mode 100644 mmaction/models/backbones/mvit.py create mode 100644 mmaction/models/heads/mvit_head.py create mode 100644 mmaction/models/utils/embed.py create mode 100644 tests/models/backbones/test_mvit.py diff --git a/configs/_base_/models/mvit_small.py b/configs/_base_/models/mvit_small.py new file mode 100644 index 0000000000..727df37c38 --- /dev/null +++ b/configs/_base_/models/mvit_small.py @@ -0,0 +1,20 @@ +model = dict( + type='Recognizer3D', + backbone=dict(type='MViT', arch='small', drop_path_rate=0.2), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), + cls_head=dict( + type='MVitHead', + in_channels=768, + num_classes=400, + label_smooth_eps=0.1, + average_clips='prob')) diff --git a/configs/recognition/mvit/README.md b/configs/recognition/mvit/README.md new file mode 100644 index 0000000000..fdc694a128 --- /dev/null +++ b/configs/recognition/mvit/README.md @@ -0,0 +1,77 @@ +# MViT V2 + +> [MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf) + + + +## Abstract + + + +In this paper, we study Multiscale Vision Transformers (MViTv2) as a unified 
architecture for image and video +classification, as well as object detection. We present an improved version of MViT that incorporates +decomposed relative positional embeddings and residual pooling connections. We instantiate this architecture +in five sizes and evaluate it for ImageNet classification, COCO detection and Kinetics video recognition where +it outperforms prior work. We further compare MViTv2s' pooling attention to window attention mechanisms where +it outperforms the latter in accuracy/compute. Without bells-and-whistles, MViTv2 has state-of-the-art +performance in 3 domains: 88.8% accuracy on ImageNet classification, 58.7 boxAP on COCO object detection as +well as 86.1% on Kinetics-400 video classification. + + + +
+ +
+ +## Results and models + +### Kinetics-400 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | params | config | ckpt | +| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :----: | :------------------: | :-----------------: | +| 16x4x1 | short-side 320 | MViTv2-S\* | From scratch | 81.1 | 94.7 | [81.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | xx.xM | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| 32x3x1 | short-side 320 | MViTv2-B\* | From scratch | 82.6 | 95.8 | [82.9](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [95.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | xx.xM | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| 40x3x1 | short-side 320 | MViTv2-L\* | From scratch | 85.4 | 96.2 | [86.1](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [97.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | + +### Something-Something V2 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | params | config | ckpt | +| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :----: | :------------------: | :-----------------: | +| uniform 16 | short-side 320 | MViTv2-S\* | K400 | 68.1 | 91.0 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| uniform 32 | short-side 320 | MViTv2-B\* | K400 | 70.8 | 92.7 | [70.5](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [92.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| uniform 40 | short-side 320 | MViTv2-L\* | IN21K + K400 | 73.2 | 94.0 | [73.3](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | + +*Models with * are ported from the repo 
[SlowFast](https://github.com/facebookresearch/SlowFast/) and tested on our data. Currently, we only support the testing of X3D models, training will be available soon.* + +1. The values in columns named after "reference" are copied from paper +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test MViT model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/mvit/mvit-small_16x4x1_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). + +## Citation + +```bibtex +@inproceedings{li2021improved, + title={MViTv2: Improved multiscale vision transformers for classification and detection}, + author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph}, + booktitle={CVPR}, + year={2022} +} +``` diff --git a/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py new file mode 100644 index 0000000000..93b33a9dc9 --- /dev/null +++ b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py @@ -0,0 +1,138 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='base', + temporal_size=32, + drop_path_rate=0.3, + )) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=3, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=3, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', 
input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=3, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=1.6e-3, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=0, + by_epoch=True, + begin=0, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
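+# (Illustrative note: when auto scaling is enabled, MMEngine multiplies the
+# learning rate by actual_total_batch_size / base_batch_size, so e.g. training
+# with 16 GPUs x 8 samples = 128 would roughly double the LR tuned for 64.)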
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py new file mode 100644 index 0000000000..c719396f29 --- /dev/null +++ b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py @@ -0,0 +1,141 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='base', + temporal_size=32, + drop_path_rate=0.3, + ), + cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=32, + out_of_bound_opt='repeat_frame'), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=32, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=32, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, 
+ convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=70, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py new file mode 100644 index 0000000000..883d9f7ce5 --- /dev/null +++ b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py @@ -0,0 +1,141 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='large', + temporal_size=40, + spatial_size=312, + drop_path_rate=0.75, + ), + cls_head=dict(in_channels=1152), + test_cfg=dict(max_testing_views=5)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=40, frame_interval=3, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 356)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(312, 312), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=40, + frame_interval=3, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 356)), + dict(type='CenterCrop', crop_size=312), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=40, + frame_interval=3, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 312)), + dict(type='ThreeCrop', crop_size=312), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + 
data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=1.6e-3, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=0, + by_epoch=True, + begin=0, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py new file mode 100644 index 0000000000..c682571df6 --- /dev/null +++ b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py @@ -0,0 +1,143 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='large', + temporal_size=40, + spatial_size=312, + drop_path_rate=0.75, + ), + cls_head=dict(in_channels=1152, num_classes=174), + test_cfg=dict(max_testing_views=5)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=40, + out_of_bound_opt='repeat_frame'), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=40, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=40, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + 
pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=70, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=10)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py new file mode 100644 index 0000000000..0df0b835fa --- /dev/null +++ b/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py @@ -0,0 +1,132 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', 
scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=200, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py new file mode 100644 index 0000000000..7327df2e11 --- /dev/null +++ b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py @@ -0,0 +1,135 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=16, + out_of_bound_opt='repeat_frame'), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=16, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=16, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + 
T_max=100, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py index f21e6d01b9..7aaaee894d 100644 --- a/mmaction/datasets/transforms/__init__.py +++ b/mmaction/datasets/transforms/__init__.py @@ -10,9 +10,9 @@ LoadProposals, OpenCVDecode, OpenCVInit, PIMSDecode, PIMSInit, PyAVDecode, PyAVDecodeMotionVector, PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, - UntrimmedSampleFrames) + UniformSampleFrames, UntrimmedSampleFrames) from .pose_loading import (GeneratePoseTarget, LoadKineticsPose, - PaddingWithLoop, PoseDecode, UniformSampleFrames) + PaddingWithLoop, PoseDecode) from .processing import (AudioAmplify, CenterCrop, ColorJitter, Flip, Fuse, MelSpectrogram, MultiScaleCrop, PoseCompact, RandomCrop, RandomRescale, RandomResizedCrop, Resize, diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index ceb761d638..e756410dac 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -265,6 +265,256 @@ def __repr__(self): return repr_str +@TRANSFORMS.register_module() +class SampleFramesV2(SampleFrames): + """Sample frames from the video. + + Required keys are "total_frames", "start_index" , added or modified keys + are "frame_inds", "frame_interval" and "num_clips". + Args: + clip_len (int): Frames of each sampled output clip. + frame_interval (int): Temporal interval of adjacent sampled frames. + Default: 1. + num_clips (int): Number of clips to be sampled. Default: 1. + temporal_jitter (bool): Whether to apply temporal jittering. + Default: False. + out_of_bound_opt (str): The way to deal with out of bounds frame + indexes. Available options are 'loop', 'repeat_last'. + Default: 'loop'. + test_mode (bool): Store True when building test or validation dataset. + Default: False. + start_index (None): This argument is deprecated and moved to dataset + class (``BaseDataset``, ``VideoDatset``, ``RawframeDataset``, etc), + see this: https://github.com/open-mmlab/mmaction2/pull/89. + keep_tail_frames (bool): Whether to keep tail frames when sampling. + Default: False. + """ + + def __init__(self, + clip_len, + frame_interval=1, + num_clips=1, + temporal_jitter=False, + out_of_bound_opt='loop', + test_mode=False, + keep_tail_frames=False): + super().__init__(clip_len, frame_interval, num_clips, temporal_jitter, + False, out_of_bound_opt, test_mode, keep_tail_frames) + + def _get_train_clips(self, num_frames): + """Get clip offsets in train mode. + + Args: + num_frames (int): Total number of frame in the video. + Returns: + np.ndarray: Sampled frame indices in train mode. 
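+            Example (illustrative): with ``clip_len=32``, ``frame_interval=3``
+            and ``num_clips=1``, a 300-frame video gives
+            ``ori_clip_len = 31 * 3 + 1 = 94``, so the single clip offset is
+            drawn uniformly from ``[0, 300 - 94)`` and rounded to an integer.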
+ """ + ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 + max_offset = max(num_frames - ori_clip_len, 0) + + num_segments = max(self.num_clips - 1, 1) + offset_between = max_offset / num_segments + clip_offsets = np.arange(self.num_clips) * offset_between + clip_offsets += np.random.uniform(0, offset_between, self.num_clips) + clip_offsets = np.round(clip_offsets).astype(np.int32) + return clip_offsets + + def _get_test_clips(self, num_frames): + """Get clip offsets in test mode. + + If the total number of frames is + not enough, it will return all zero indices. + Args: + num_frames (int): Total number of frame in the video. + Returns: + np.ndarray: Sampled frame indices in test mode. + """ + ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 + max_offset = max(num_frames - ori_clip_len, 0) + + num_segments = max(self.num_clips - 1, 1) + offset_between = max_offset / float(num_segments) + clip_offsets = np.arange(self.num_clips) * offset_between + clip_offsets = np.round(clip_offsets).astype(np.int32) + return clip_offsets + + +@TRANSFORMS.register_module() +class UniformSampleFrames(BaseTransform): + """Uniformly sample frames from the video. + + To sample an n-frame clip from the video. UniformSampleFrames basically + divide the video into n segments of equal length and randomly sample one + frame from each segment. To make the testing results reproducible, a + random seed is set during testing, to make the sampling results + deterministic. + + Required keys are "total_frames", "start_index" , added or modified keys + are "frame_inds", "clip_len", "frame_interval" and "num_clips". + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Default: 1. + test_mode (bool): Store True when building test or validation dataset. + Default: False. + out_of_bound_opt (str): The way to deal with out of bounds frame + indexes. Available options are 'loop', 'repeat_frame'. + Default: 'loop'. + seed (int): The random seed used during test time. Default: 255. + """ + + def __init__(self, + clip_len, + num_clips=1, + test_mode=False, + seed=255, + out_of_bound_opt='loop'): + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + self.seed = seed + self.out_of_bound_opt = out_of_bound_opt + assert self.out_of_bound_opt in ['loop', 'repeat_frame'] + + def _get_train_clips(self, num_frames): + """Uniformly sample indices for training clips. + + Args: + num_frames (int): The number of frames. + """ + + assert self.num_clips == 1 + if num_frames < self.clip_len: + start = np.random.randint(0, num_frames) + inds = np.arange(start, start + self.clip_len) + elif self.clip_len <= num_frames < 2 * self.clip_len: + basic = np.arange(self.clip_len) + inds = np.random.choice( + self.clip_len + 1, num_frames - self.clip_len, replace=False) + offset = np.zeros(self.clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array([ + i * num_frames // self.clip_len + for i in range(self.clip_len + 1) + ]) + bsize = np.diff(bids) + bst = bids[:self.clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + return inds + + def _get_test_clips(self, num_frames): + """Uniformly sample indices for testing clips. + + Args: + num_frames (int): The number of frames. 
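+            Note (illustrative): the random seed is (re)set here on every
+            call, so the sampled test indices depend only on ``num_frames``,
+            ``clip_len`` and ``num_clips``, keeping testing deterministic.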
+ """ + + np.random.seed(self.seed) + if num_frames < self.clip_len: + # Then we use a simple strategy + if num_frames < self.num_clips: + start_inds = list(range(self.num_clips)) + else: + start_inds = [ + i * num_frames // self.num_clips + for i in range(self.num_clips) + ] + inds = np.concatenate( + [np.arange(i, i + self.clip_len) for i in start_inds]) + elif self.clip_len <= num_frames < self.clip_len * 2: + all_inds = [] + for i in range(self.num_clips): + basic = np.arange(self.clip_len) + inds = np.random.choice( + self.clip_len + 1, + num_frames - self.clip_len, + replace=False) + offset = np.zeros(self.clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + all_inds.append(inds) + inds = np.concatenate(all_inds) + else: + bids = np.array([ + i * num_frames // self.clip_len + for i in range(self.clip_len + 1) + ]) + bsize = np.diff(bids) + bst = bids[:self.clip_len] + all_inds = [] + for i in range(self.num_clips): + offset = np.random.randint(bsize) + all_inds.append(bst + offset) + inds = np.concatenate(all_inds) + return inds + + def _get_repeat_sample_clips(self, num_frames): + """Repeat sample when video is shorter than clip_len Modified from + https://github.com/facebookresearch/SlowFast/blob/64ab + cc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. + + When video frames is shorter than target clip len, this strategy would + repeat sample frame, rather than loop sample in 'loop' mode. + In test mode, this strategy would sample the middle frame of each + segment, rather than set a random seed, and therefore only support + sample 1 clip. + + Args: + num_frames (int): Total number of frame in the video. + Returns: + seq (list): the indexes of frames of sampled from the video. + """ + assert self.num_clips == 1 + seg_size = float(num_frames - 1) / self.clip_len + inds = [] + for i in range(self.clip_len): + start = int(np.round(seg_size * i)) + end = int(np.round(seg_size * (i + 1))) + if not self.test_mode: + inds.append(np.random.randint(start, end + 1)) + else: + inds.append((start + end) // 2) + + return np.array(inds) + + def transform(self, results): + num_frames = results['total_frames'] + + if self.out_of_bound_opt == 'loop': + if self.test_mode: + inds = self._get_test_clips(num_frames) + else: + inds = self._get_train_clips(num_frames) + inds = np.mod(inds, num_frames) + elif self.out_of_bound_opt == 'repeat_frame': + inds = self._get_repeat_sample_clips(num_frames) + else: + raise ValueError('Illegal out_of_bound option.') + + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}, ' + f'seed={self.seed})') + return repr_str + + @TRANSFORMS.register_module() class UntrimmedSampleFrames(BaseTransform): """Sample frames from the untrimmed video. diff --git a/mmaction/datasets/transforms/pose_loading.py b/mmaction/datasets/transforms/pose_loading.py index 58748eacb6..592850334f 100644 --- a/mmaction/datasets/transforms/pose_loading.py +++ b/mmaction/datasets/transforms/pose_loading.py @@ -11,141 +11,6 @@ from .processing import Flip -@TRANSFORMS.register_module() -class UniformSampleFrames(BaseTransform): - """Uniformly sample frames from the video. 
- - To sample an n-frame clip from the video. UniformSampleFrames basically - divide the video into n segments of equal length and randomly sample one - frame from each segment. To make the testing results reproducible, a - random seed is set during testing, to make the sampling results - deterministic. - - Required keys are ``'total_frames'``, ``'start_index'`` , added or - modified keys are ``'frame_inds'``, ``'clip_len'``, - ``'frame_interval'`` and ``'num_clips'``. - - Args: - clip_len (int): Frames of each sampled output clip. - num_clips (int): Number of clips to be sampled. Defaults to 1. - test_mode (bool): Store True when building test or validation dataset. - Defaults to False. - seed (int): The random seed used during test time. Defaults to 255. - """ - - def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255): - - self.clip_len = clip_len - self.num_clips = num_clips - self.test_mode = test_mode - self.seed = seed - - def _get_train_clips(self, num_frames, clip_len): - """Uniformly sample indices for training clips. - - Args: - num_frames (int): The number of frames. - clip_len (int): The length of the clip. - """ - - assert self.num_clips == 1 - if num_frames < clip_len: - start = np.random.randint(0, num_frames) - inds = np.arange(start, start + clip_len) - elif clip_len <= num_frames < 2 * clip_len: - basic = np.arange(clip_len) - inds = np.random.choice( - clip_len + 1, num_frames - clip_len, replace=False) - offset = np.zeros(clip_len + 1, dtype=np.int32) - offset[inds] = 1 - offset = np.cumsum(offset) - inds = basic + offset[:-1] - else: - bids = np.array( - [i * num_frames // clip_len for i in range(clip_len + 1)]) - bsize = np.diff(bids) - bst = bids[:clip_len] - offset = np.random.randint(bsize) - inds = bst + offset - return inds - - def _get_test_clips(self, num_frames, clip_len): - """Uniformly sample indices for testing clips. - - Args: - num_frames (int): The number of frames. - clip_len (int): The length of the clip. - """ - - np.random.seed(self.seed) - if num_frames < clip_len: - # Then we use a simple strategy - if num_frames < self.num_clips: - start_inds = list(range(self.num_clips)) - else: - start_inds = [ - i * num_frames // self.num_clips - for i in range(self.num_clips) - ] - inds = np.concatenate( - [np.arange(i, i + clip_len) for i in start_inds]) - elif clip_len <= num_frames < clip_len * 2: - all_inds = [] - for i in range(self.num_clips): - basic = np.arange(clip_len) - inds = np.random.choice( - clip_len + 1, num_frames - clip_len, replace=False) - offset = np.zeros(clip_len + 1, dtype=np.int32) - offset[inds] = 1 - offset = np.cumsum(offset) - inds = basic + offset[:-1] - all_inds.append(inds) - inds = np.concatenate(all_inds) - else: - bids = np.array( - [i * num_frames // clip_len for i in range(clip_len + 1)]) - bsize = np.diff(bids) - bst = bids[:clip_len] - all_inds = [] - for i in range(self.num_clips): - offset = np.random.randint(bsize) - all_inds.append(bst + offset) - inds = np.concatenate(all_inds) - return inds - - def transform(self, results): - """Perform the SampleFrames loading. - - Args: - results (dict): The resulting dict to be modified and passed - to the next transform in pipeline. 
- """ - num_frames = results['total_frames'] - - if self.test_mode: - inds = self._get_test_clips(num_frames, self.clip_len) - else: - inds = self._get_train_clips(num_frames, self.clip_len) - - inds = np.mod(inds, num_frames) - start_index = results['start_index'] - inds = inds + start_index - - results['frame_inds'] = inds.astype(np.int32) - results['clip_len'] = self.clip_len - results['frame_interval'] = None - results['num_clips'] = self.num_clips - return results - - def __repr__(self): - repr_str = (f'{self.__class__.__name__}(' - f'clip_len={self.clip_len}, ' - f'num_clips={self.num_clips}, ' - f'test_mode={self.test_mode}, ' - f'seed={self.seed})') - return repr_str - - @TRANSFORMS.register_module() class PoseDecode(BaseTransform): """Load and decode pose with given indices. diff --git a/mmaction/datasets/transforms/processing.py b/mmaction/datasets/transforms/processing.py index 6ea381030f..d34bc93327 100644 --- a/mmaction/datasets/transforms/processing.py +++ b/mmaction/datasets/transforms/processing.py @@ -1,12 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. import random import warnings +from numbers import Number +from typing import Sequence import cv2 import mmcv import mmengine import numpy as np from mmcv.transforms import BaseTransform +from mmcv.transforms.utils import cache_randomness from torch.nn.modules.utils import _pair from mmaction.registry import TRANSFORMS @@ -1491,3 +1494,170 @@ def __repr__(self): f'n_mels={self.n_mels}, ' f'fixed_length={self.fixed_length})') return repr_str + + +@TRANSFORMS.register_module() +class RandomErasing(BaseTransform): + """Randomly selects a rectangle region in an image and erase pixels. + basically refer mmcls. + + **Required Keys:** + + - img + + **Modified Keys:** + + - img + + Args: + erase_prob (float): Probability that image will be randomly erased. + Default: 0.5 + min_area_ratio (float): Minimum erased area / input image area + Default: 0.02 + max_area_ratio (float): Maximum erased area / input image area + Default: 1/3 + aspect_range (sequence | float): Aspect ratio range of erased area. + if float, it will be converted to (aspect_ratio, 1/aspect_ratio) + Default: (3/10, 10/3) + mode (str): Fill method in erased area, can be: + + - const (default): All pixels are assign with the same value. + - rand: each pixel is assigned with a random value in [0, 255] + + fill_color (sequence | Number): Base color filled in erased area. + Defaults to (128, 128, 128). + fill_std (sequence | Number, optional): If set and ``mode`` is 'rand', + fill erased area with random color from normal distribution + (mean=fill_color, std=fill_std); If not set, fill erased area with + random color from uniform distribution (0~255). Defaults to None. + + Note: + See `Random Erasing Data Augmentation + `_ + + This paper provided 4 modes: RE-R, RE-M, RE-0, RE-255, and use RE-M as + default. The config of these 4 modes are: + + - RE-R: RandomErasing(mode='rand') + - RE-M: RandomErasing(mode='const', fill_color=(123.67, 116.3, 103.5)) + - RE-0: RandomErasing(mode='const', fill_color=0) + - RE-255: RandomErasing(mode='const', fill_color=255) + """ + + def __init__(self, + erase_prob=0.5, + min_area_ratio=0.02, + max_area_ratio=1 / 3, + aspect_range=(3 / 10, 10 / 3), + mode='const', + fill_color=(128, 128, 128), + fill_std=None): + assert isinstance(erase_prob, float) and 0. <= erase_prob <= 1. + assert isinstance(min_area_ratio, float) and 0. <= min_area_ratio <= 1. + assert isinstance(max_area_ratio, float) and 0. <= max_area_ratio <= 1. 
+ assert min_area_ratio <= max_area_ratio, \ + 'min_area_ratio should be smaller than max_area_ratio' + if isinstance(aspect_range, float): + aspect_range = min(aspect_range, 1 / aspect_range) + aspect_range = (aspect_range, 1 / aspect_range) + assert isinstance(aspect_range, Sequence) and len(aspect_range) == 2 \ + and all(isinstance(x, float) for x in aspect_range), \ + 'aspect_range should be a float or Sequence with two float.' + assert all(x > 0 for x in aspect_range), \ + 'aspect_range should be positive.' + assert aspect_range[0] <= aspect_range[1], \ + 'In aspect_range (min, max), min should be smaller than max.' + assert mode in ['const', 'rand'], \ + 'Please select `mode` from ["const", "rand"].' + if isinstance(fill_color, Number): + fill_color = [fill_color] * 3 + assert isinstance(fill_color, Sequence) and len(fill_color) == 3 \ + and all(isinstance(x, Number) for x in fill_color), \ + 'fill_color should be a float or Sequence with three int.' + if fill_std is not None: + if isinstance(fill_std, Number): + fill_std = [fill_std] * 3 + assert isinstance(fill_std, Sequence) and len(fill_std) == 3 \ + and all(isinstance(x, Number) for x in fill_std), \ + 'fill_std should be a float or Sequence with three int.' + + self.erase_prob = erase_prob + self.min_area_ratio = min_area_ratio + self.max_area_ratio = max_area_ratio + self.aspect_range = aspect_range + self.mode = mode + self.fill_color = fill_color + self.fill_std = fill_std + + def _img_fill_pixels(self, img, top, left, h, w): + """Fill pixels to the patch of image.""" + if self.mode == 'const': + patch = np.empty((h, w, 3), dtype=np.uint8) + patch[:, :] = np.array(self.fill_color, dtype=np.uint8) + elif self.fill_std is None: + # Uniform distribution + patch = np.random.uniform(0, 256, (h, w, 3)).astype(np.uint8) + else: + # Normal distribution + patch = np.random.normal(self.fill_color, self.fill_std, (h, w, 3)) + patch = np.clip(patch.astype(np.int32), 0, 255).astype(np.uint8) + + img[top:top + h, left:left + w] = patch + return img + + def _fill_pixels(self, imgs, top, left, h, w): + """Fill pixels to the patch of each image in frame clip.""" + return [self._img_fill_pixels(img, top, left, h, w) for img in imgs] + + @cache_randomness + def random_disable(self): + """Randomly disable the transform.""" + return np.random.rand() > self.erase_prob + + @cache_randomness + def random_patch(self, img_h, img_w): + """Randomly generate patch the erase.""" + # convert the aspect ratio to log space to equally handle width and + # height. + log_aspect_range = np.log( + np.array(self.aspect_range, dtype=np.float32)) + aspect_ratio = np.exp(np.random.uniform(*log_aspect_range)) + area = img_h * img_w + area *= np.random.uniform(self.min_area_ratio, self.max_area_ratio) + + h = min(int(round(np.sqrt(area * aspect_ratio))), img_h) + w = min(int(round(np.sqrt(area / aspect_ratio))), img_w) + top = np.random.randint(0, img_h - h) if img_h > h else 0 + left = np.random.randint(0, img_w - w) if img_w > w else 0 + return top, left, h, w + + def transform(self, results): + """ + Args: + results (dict): Results dict from pipeline + + Returns: + dict: Results after the transformation. 
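+        Note (illustrative): one random patch location is sampled per call
+        and applied to every frame in ``results['imgs']``, so the erased
+        region is temporally consistent across the clip; only the random
+        fill values differ per frame in 'rand' mode.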
+ """ + if self.random_disable(): + return results + + imgs = results['imgs'] + img_h, img_w = imgs[0].shape[:2] + + imgs = self._fill_pixels(imgs, *self.random_patch(img_h, img_w)) + + results['imgs'] = imgs + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(erase_prob={self.erase_prob}, ' + repr_str += f'min_area_ratio={self.min_area_ratio}, ' + repr_str += f'max_area_ratio={self.max_area_ratio}, ' + repr_str += f'aspect_range={self.aspect_range}, ' + repr_str += f'mode={self.mode}, ' + repr_str += f'fill_color={self.fill_color}, ' + repr_str += f'fill_std={self.fill_std})' + return repr_str diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py index 30301b2b28..6a2c7b526a 100644 --- a/mmaction/models/backbones/__init__.py +++ b/mmaction/models/backbones/__init__.py @@ -4,6 +4,7 @@ from .c3d import C3D from .mobilenet_v2 import MobileNetV2 from .mobilenet_v2_tsm import MobileNetV2TSM +from .mvit import MViT from .resnet import ResNet from .resnet2plus1d import ResNet2Plus1d from .resnet3d import ResNet3d, ResNet3dLayer @@ -24,5 +25,6 @@ 'C2D', 'C3D', 'ResNet', 'ResNet3d', 'ResNetTSM', 'ResNet2Plus1d', 'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNet3dCSN', 'ResNetTIN', 'X3D', 'ResNet3dLayer', 'MobileNetV2TSM', 'MobileNetV2', 'TANet', 'TimeSformer', - 'STGCN', 'AGCN', 'ResNetAudio', 'SwinTransformer3D', 'VisionTransformer' + 'STGCN', 'AGCN', 'ResNetAudio', 'SwinTransformer3D', 'VisionTransformer', + 'MViT' ] diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py new file mode 100644 index 0000000000..7974767cfc --- /dev/null +++ b/mmaction/models/backbones/mvit.py @@ -0,0 +1,850 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import trunc_normal_ +from mmengine.utils import to_3tuple + +from mmaction.registry import MODELS +from ..utils.embed import PatchEmbed3D + + +def resize_pos_embed(pos_embed, + src_shape, + dst_shape, + mode='trilinear', + num_extra_tokens=1): + """Resize pos_embed weights. + + Args: + pos_embed (torch.Tensor): Position embedding weights with shape + [1, L, C]. + src_shape (tuple): The resolution of downsampled origin training + image, in format (T, H, W). + dst_shape (tuple): The resolution of downsampled new training + image, in format (T, H, W). + mode (str): Algorithm used for upsampling. Choose one from 'nearest', + 'linear', 'bilinear', 'bicubic' and 'trilinear'. + Defaults to 'trilinear'. + num_extra_tokens (int): The number of extra tokens, such as cls_token. + Defaults to 1. + + Returns: + torch.Tensor: The resized pos_embed of shape [1, L_new, C] + """ + if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1] \ + and src_shape[2] == dst_shape[2]: + return pos_embed + assert pos_embed.ndim == 3, 'shape of pos_embed must be [1, L, C]' + _, L, C = pos_embed.shape + src_t, src_h, src_w = src_shape + assert L == src_t * src_h * src_w + num_extra_tokens, \ + f"The length of `pos_embed` ({L}) doesn't match the expected " \ + f'shape ({src_t}*{src_h}*{src_w}+{num_extra_tokens}).' \ + 'Please check the `img_size` argument.' 
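+    # Keep the extra (e.g. class) tokens untouched; only the T*H*W grid
+    # tokens below are reshaped to 3D and interpolated to `dst_shape`
+    # (trilinear by default).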
+ extra_tokens = pos_embed[:, :num_extra_tokens] + + src_weight = pos_embed[:, num_extra_tokens:] + src_weight = src_weight.reshape(1, src_t, src_h, src_w, + C).permute(0, 4, 1, 2, 3) + + dst_weight = F.interpolate( + src_weight, size=dst_shape, align_corners=False, mode=mode) + dst_weight = torch.flatten(dst_weight, 2).transpose(1, 2) + + return torch.cat((extra_tokens, dst_weight), dim=1) + + +def resize_decomposed_rel_pos(rel_pos, q_size, k_size): + """Get relative positional embeddings according to the relative positions + of query and key sizes. + + Args: + rel_pos (Tensor): relative position embeddings (L, C). + q_size (int): size of query q. + k_size (int): size of key k. + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + resized = F.interpolate( + # (L, C) -> (1, C, L) + rel_pos.transpose(0, 1).unsqueeze(0), + size=max_rel_dist, + mode='linear', + ) + # (1, C, L) -> (L, C) + resized = resized.squeeze(0).transpose(0, 1) + else: + resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_h_ratio = max(k_size / q_size, 1.0) + k_h_ratio = max(q_size / k_size, 1.0) + q_coords = torch.arange(q_size)[:, None] * q_h_ratio + k_coords = torch.arange(k_size)[None, :] * k_h_ratio + relative_coords = (q_coords - k_coords) + (k_size - 1) * k_h_ratio + + return resized[relative_coords.long()] + + +def add_decomposed_rel_pos(attn, + q, + q_shape, + k_shape, + rel_pos_h, + rel_pos_w, + rel_pos_t, + with_cls_token=False): + """Spatiotemporal Relative Positional Embeddings.""" + sp_idx = 1 if with_cls_token else 0 + B, num_heads, _, C = q.shape + q_t, q_h, q_w = q_shape + k_t, k_h, k_w = k_shape + + Rt = resize_decomposed_rel_pos(rel_pos_t, q_t, k_t) + Rh = resize_decomposed_rel_pos(rel_pos_h, q_h, k_h) + Rw = resize_decomposed_rel_pos(rel_pos_w, q_w, k_w) + + r_q = q[:, :, sp_idx:].reshape(B, num_heads, q_t, q_h, q_w, C) + rel_t = torch.einsum('bythwc,tkc->bythwk', r_q, Rt) + rel_h = torch.einsum('bythwc,hkc->bythwk', r_q, Rh) + rel_w = torch.einsum('bythwc,wkc->bythwk', r_q, Rw) + rel_pos_embed = ( + rel_t[:, :, :, :, :, :, None, None] + + rel_h[:, :, :, :, :, None, :, None] + + rel_w[:, :, :, :, :, None, None, :]) + + attn_map = attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_t, q_h, q_w, k_t, + k_h, k_w) + attn_map += rel_pos_embed + attn[:, :, sp_idx:, sp_idx:] = attn_map.view(B, -1, q_t * q_h * q_w, + k_t * k_h * k_w) + + return attn + + +class MLP(BaseModule): + """Two-layer multilayer perceptron. + + Comparing with :class:`mmcv.cnn.bricks.transformer.FFN`, this class allows + different input and output channel numbers. + + Args: + in_channels (int): The number of input channels. + hidden_channels (int, optional): The number of hidden layer channels. + If None, same as the ``in_channels``. Defaults to None. + out_channels (int, optional): The number of output channels. If None, + same as the ``in_channels``. Defaults to None. + act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
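+    Note: MViT-style blocks typically instantiate this with
+    ``hidden_channels`` around 4x ``in_channels`` (the common transformer
+    MLP ratio); treat the exact ratio as configuration-dependent.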
+ """ + + def __init__(self, + in_channels, + hidden_channels=None, + out_channels=None, + act_cfg=dict(type='GELU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + out_channels = out_channels or in_channels + hidden_channels = hidden_channels or in_channels + self.fc1 = nn.Linear(in_channels, hidden_channels) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Linear(hidden_channels, out_channels) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +def attention_pool(x: torch.Tensor, + pool: nn.Module, + in_size: tuple, + with_cls_token: bool = False, + norm: Optional[nn.Module] = None): + """Pooling the feature tokens. + + Args: + x (torch.Tensor): The input tensor, should be with shape + ``(B, num_heads, L, C)`` or ``(B, L, C)``. + pool (nn.Module): The pooling module. + in_size (Tuple[int]): The shape of the input feature map. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + norm (nn.Module, optional): The normalization module. + Defaults to None. + """ + ndim = x.ndim + if ndim == 4: + B, num_heads, L, C = x.shape + elif ndim == 3: + num_heads = 1 + B, L, C = x.shape + x = x.unsqueeze(1) + else: + raise RuntimeError(f'Unsupported input dimension {x.shape}') + + T, H, W = in_size + assert L == T * H * W + with_cls_token + + if with_cls_token: + cls_tok, x = x[:, :, :1, :], x[:, :, 1:, :] + + # (B, num_heads, T*H*W, C) -> (B*num_heads, C, T, H, W) + x = x.reshape(B * num_heads, T, H, W, C).permute(0, 4, 1, 2, + 3).contiguous() + x = pool(x) + out_size = x.shape[2:] + + # (B*num_heads, C, T', H', W') -> (B, num_heads, T'*H'*W', C) + x = x.reshape(B, num_heads, C, -1).transpose(2, 3) + + if with_cls_token: + x = torch.cat((cls_tok, x), dim=2) + + if norm is not None: + x = norm(x) + + if ndim == 3: + x = x.squeeze(1) + + return x, out_size + + +class MultiScaleAttention(BaseModule): + """Multiscale Multi-head Attention block. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + stride_q (int): stride size for q pooling layer. + Defaults to (1, 1, 1). + stride_kv (int): stride size for kv pooling layer. + Defaults to (1, 1, 1). + rel_pos_embed (bool): Whether to enable the spatial and temporal + relative position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_embed``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
+ """ + + def __init__(self, + in_dims, + out_dims, + num_heads, + qkv_bias=True, + norm_cfg=dict(type='LN'), + pool_kernel=(3, 3, 3), + stride_q=(1, 1, 1), + stride_kv=(1, 1, 1), + rel_pos_embed=True, + residual_pooling=True, + input_size=None, + rel_pos_zero_init=False, + with_cls_token=True, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_heads = num_heads + self.with_cls_token = with_cls_token + self.in_dims = in_dims + self.out_dims = out_dims + + head_dim = out_dims // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(in_dims, out_dims * 3, bias=qkv_bias) + self.proj = nn.Linear(out_dims, out_dims) + + # qkv pooling + pool_padding = [k // 2 for k in pool_kernel] + pool_dims = out_dims // num_heads + + def build_pooling(stride): + pool = nn.Conv3d( + pool_dims, + pool_dims, + pool_kernel, + stride=stride, + padding=pool_padding, + groups=pool_dims, + bias=False, + ) + norm = build_norm_layer(norm_cfg, pool_dims)[1] + return pool, norm + + self.pool_q, self.norm_q = build_pooling(stride_q) + self.pool_k, self.norm_k = build_pooling(stride_kv) + self.pool_v, self.norm_v = build_pooling(stride_kv) + + self.residual_pooling = residual_pooling + + self.rel_pos_embed = rel_pos_embed + self.rel_pos_zero_init = rel_pos_zero_init + if self.rel_pos_embed: + # initialize relative positional embeddings + assert input_size[1] == input_size[2] + + size = input_size[1] + rel_dim = 2 * max(size // stride_q[1], size // stride_kv[1]) - 1 + self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_t = nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim)) + + def init_weights(self): + """Weight initialization.""" + super().init_weights() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress rel_pos_zero_init if use pretrained model. + return + + if not self.rel_pos_zero_init: + trunc_normal_(self.rel_pos_h, std=0.02) + trunc_normal_(self.rel_pos_w, std=0.02) + if not self.rel_pos_zero_init: + trunc_normal_(self.rel_pos_t, std=0.02) + + def forward(self, x, in_size): + """Forward the MultiScaleAttention.""" + B, N, _ = x.shape # (B, H*W, C) + + # qkv: (B, H*W, 3, num_heads, C) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1) + # q, k, v: (B, num_heads, H*W, C) + q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0) + + q, q_shape = attention_pool( + q, + self.pool_q, + in_size, + norm=self.norm_q, + with_cls_token=self.with_cls_token) + k, k_shape = attention_pool( + k, + self.pool_k, + in_size, + norm=self.norm_k, + with_cls_token=self.with_cls_token) + v, v_shape = attention_pool( + v, + self.pool_v, + in_size, + norm=self.norm_v, + with_cls_token=self.with_cls_token) + + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.rel_pos_embed: + attn = add_decomposed_rel_pos(attn, q, q_shape, k_shape, + self.rel_pos_h, self.rel_pos_w, + self.rel_pos_t, self.with_cls_token) + + attn = attn.softmax(dim=-1) + x = attn @ v + + if self.residual_pooling: + if self.with_cls_token: + x[:, :, 1:, :] += q[:, :, 1:, :] + else: + x = x + q + + # (B, num_heads, H'*W', C'//num_heads) -> (B, H'*W', C') + x = x.transpose(1, 2).reshape(B, -1, self.out_dims) + x = self.proj(x) + + return x, q_shape + + +class MultiScaleBlock(BaseModule): + """Multiscale Transformer blocks. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. 
+ mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + drop_path (float): Stochastic depth rate. Defaults to 0. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + qkv_pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + stride_q (int): stride size for q pooling layer. + Defaults to (1, 1, 1). + stride_kv (int): stride size for kv pooling layer. + Defaults to (1, 1, 1). + rel_pos_embed (bool): Whether to enable the spatial relative + position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in + attention layers. If False, multiply it in MLP layers. + Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_embed``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + in_dims, + out_dims, + num_heads, + mlp_ratio=4.0, + qkv_bias=True, + drop_path=0.0, + norm_cfg=dict(type='LN'), + act_cfg=dict(type='GELU'), + qkv_pool_kernel=(3, 3, 3), + stride_q=(1, 1, 1), + stride_kv=(1, 1, 1), + rel_pos_embed=True, + residual_pooling=True, + with_cls_token=True, + dim_mul_in_attention=True, + input_size=None, + rel_pos_zero_init=False, + init_cfg=None, + ): + super().__init__(init_cfg=init_cfg) + self.with_cls_token = with_cls_token + self.in_dims = in_dims + self.out_dims = out_dims + self.norm1 = build_norm_layer(norm_cfg, in_dims)[1] + self.dim_mul_in_attention = dim_mul_in_attention + + attn_dims = out_dims if dim_mul_in_attention else in_dims + self.attn = MultiScaleAttention( + in_dims, + attn_dims, + num_heads=num_heads, + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + pool_kernel=qkv_pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + rel_pos_embed=rel_pos_embed, + residual_pooling=residual_pooling, + input_size=input_size, + rel_pos_zero_init=rel_pos_zero_init) + self.drop_path = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = build_norm_layer(norm_cfg, attn_dims)[1] + + self.mlp = MLP( + in_channels=attn_dims, + hidden_channels=int(attn_dims * mlp_ratio), + out_channels=out_dims, + act_cfg=act_cfg) + + if in_dims != out_dims: + self.proj = nn.Linear(in_dims, out_dims) + else: + self.proj = None + + if np.prod(stride_q) > 1: + kernel_skip = [s + 1 if s > 1 else s for s in stride_q] + padding_skip = [int(skip // 2) for skip in kernel_skip] + self.pool_skip = nn.MaxPool3d( + kernel_skip, stride_q, padding_skip, ceil_mode=False) + + if input_size is not None: + input_size = to_3tuple(input_size) + out_size = [size // s for size, s in zip(input_size, stride_q)] + self.init_out_size = out_size + else: + self.init_out_size = None + else: + self.pool_skip = None + self.init_out_size = input_size + + def forward(self, x, in_size): + x_norm = self.norm1(x) + x_attn, out_size = self.attn(x_norm, in_size) + + if self.dim_mul_in_attention and self.proj is not None: + skip = 
self.proj(x_norm) + else: + skip = x + + if self.pool_skip is not None: + skip, _ = attention_pool( + skip, + self.pool_skip, + in_size, + with_cls_token=self.with_cls_token) + + x = skip + self.drop_path(x_attn) + x_norm = self.norm2(x) + x_mlp = self.mlp(x_norm) + + if not self.dim_mul_in_attention and self.proj is not None: + skip = self.proj(x_norm) + else: + skip = x + + x = skip + self.drop_path(x_mlp) + + return x, out_size + + +@MODELS.register_module() +class MViT(BaseModule): + """Multi-scale ViT v2. + + A PyTorch implement of : `MViTv2: Improved Multiscale Vision Transformers + for Classification and Detection `_ + + Inspiration from `the official implementation + `_ and `the mmclassification + implementation `_ + + Args: + arch (str | dict): MViT architecture. If use string, choose + from 'tiny', 'small', 'base' and 'large'. If use dict, it should + have below keys: + + - **embed_dims** (int): The dimensions of embedding. + - **num_layers** (int): The number of layers. + - **num_heads** (int): The number of heads in attention + modules of the initial layer. + - **downscale_indices** (List[int]): The layer indices to downscale + the feature map. + + Defaults to 'base'. + spatial_size (int): The expected input spatial_size shape. + Defaults to 224. + temporal_size (int): The expected input temporal_size shape. + Defaults to 224. + in_channels (int): The num of input channels. Defaults to 3. + out_scales (int | Sequence[int]): The output scale indices. + They should not exceed the length of ``downscale_indices``. + Defaults to -1, which means the last scale. + drop_path_rate (float): Stochastic depth rate. Defaults to 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults to False. + interpolate_mode (str): Select the interpolate mode for absolute + position embedding vector resize. Defaults to "trilinear". + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + dim_mul (int): The magnification for ``embed_dims`` in the downscale + layers. Defaults to 2. + head_mul (int): The magnification for ``num_heads`` in the downscale + layers. Defaults to 2. + adaptive_kv_stride (int): The stride size for kv pooling in the initial + layer. Defaults to (1, 8, 8). + rel_pos_embed (bool): Whether to enable the spatial and temporal + relative position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in + attention layers. If False, multiply it in MLP layers. + Defaults to True. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + output_cls_token (bool): Whether output the cls_token. If set True, + ``with_cls_token`` must be True. Defaults to True. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + qkv_bias (bool): enable bias for qkv if True. Defaults to True. + norm_cfg (dict): Config dict for normalization layer for all output + features. Defaults to ``dict(type='LN', eps=1e-6)``. + patch_cfg (dict): Config dict for the patch embedding layer. + Defaults to + ``dict(kernel_size=(3, 7, 7), + stride=(2, 4, 4), + padding=(1, 3, 3))``. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. 
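One detail of `MultiScaleBlock` above that is easy to miss: when `stride_q` downsamples the query tokens, the identity branch must shrink the same way, which is what `pool_skip` (a MaxPool3d with kernel `s + 1` per strided axis) does. A small standalone check of that arithmetic, with illustrative shapes:

```python
import torch
import torch.nn as nn

stride_q = (1, 2, 2)
kernel_skip = [s + 1 if s > 1 else s for s in stride_q]   # [1, 3, 3]
padding_skip = [k // 2 for k in kernel_skip]              # [0, 1, 1]
pool_skip = nn.MaxPool3d(kernel_skip, stride_q, padding_skip, ceil_mode=False)

tokens = torch.randn(1, 96, 8, 56, 56)                    # skip path as (B, C, T, H, W)
print(pool_skip(tokens).shape)                            # torch.Size([1, 96, 8, 28, 28])
```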
+ + Examples: + >>> import torch + >>> from mmaction.registry import MODELS + >>> from mmaction.utils import register_all_modules + >>> register_all_modules() + >>> + >>> cfg = dict(type='MViT', arch='tiny', out_scales=[0, 1, 2, 3]) + >>> model = model = MODELS.build(cfg) + >>> inputs = torch.rand(1, 3, 16, 224, 224) + >>> outputs = model(inputs) + >>> for i, output in enumerate(outputs): + >>> print(f'scale{i}: {output.shape}') + scale0: torch.Size([1, 96, 8, 56, 56]) + scale1: torch.Size([1, 192, 8, 28, 28]) + scale2: torch.Size([1, 384, 8, 14, 14]) + scale3: torch.Size([1, 768, 8, 7, 7]) + """ + arch_zoo = { + 'tiny': { + 'embed_dims': 96, + 'num_layers': 10, + 'num_heads': 1, + 'downscale_indices': [1, 3, 8] + }, + 'small': { + 'embed_dims': 96, + 'num_layers': 16, + 'num_heads': 1, + 'downscale_indices': [1, 3, 14] + }, + 'base': { + 'embed_dims': 96, + 'num_layers': 24, + 'num_heads': 1, + 'downscale_indices': [2, 5, 21] + }, + 'large': { + 'embed_dims': 144, + 'num_layers': 48, + 'num_heads': 2, + 'downscale_indices': [2, 8, 44] + }, + } + num_extra_tokens = 1 + + def __init__(self, + arch='base', + spatial_size=224, + temporal_size=16, + in_channels=3, + pretrained=None, + out_scales=-1, + drop_path_rate=0., + use_abs_pos_embed=False, + interpolate_mode='trilinear', + pool_kernel=(3, 3, 3), + dim_mul=2, + head_mul=2, + adaptive_kv_stride=(1, 8, 8), + rel_pos_embed=True, + residual_pooling=True, + dim_mul_in_attention=True, + with_cls_token=True, + output_cls_token=True, + rel_pos_zero_init=False, + mlp_ratio=4., + qkv_bias=True, + norm_cfg=dict(type='LN', eps=1e-6), + patch_cfg=dict( + kernel_size=(3, 7, 7), + stride=(2, 4, 4), + padding=(1, 3, 3)), + init_cfg=None): + if pretrained: + init_cfg = dict(type='Pretrained', checkpoint=pretrained) + super().__init__(init_cfg=init_cfg) + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = { + 'embed_dims', 'num_layers', 'num_heads', 'downscale_indices' + } + assert isinstance(arch, dict) and essential_keys <= set(arch), \ + f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.num_layers = self.arch_settings['num_layers'] + self.num_heads = self.arch_settings['num_heads'] + self.downscale_indices = self.arch_settings['downscale_indices'] + self.num_scales = len(self.downscale_indices) + 1 + self.stage_indices = { + index - 1: i + for i, index in enumerate(self.downscale_indices) + } + self.stage_indices[self.num_layers - 1] = self.num_scales - 1 + self.use_abs_pos_embed = use_abs_pos_embed + self.interpolate_mode = interpolate_mode + + if isinstance(out_scales, int): + out_scales = [out_scales] + assert isinstance(out_scales, Sequence), \ + f'"out_scales" must by a sequence or int, ' \ + f'get {type(out_scales)} instead.' 
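The arch settings above can be related to output scales with a short pure-Python check that mirrors the `stage_indices` bookkeeping in `__init__` (values here are the 'tiny' settings): each index in `downscale_indices` starts a new, downscaled stage, and the layer right before each downscale plus the final layer are the per-scale outputs.

```python
num_layers = 10
downscale_indices = [1, 3, 8]
num_scales = len(downscale_indices) + 1                   # 4 scales

stage_indices = {idx - 1: i for i, idx in enumerate(downscale_indices)}
stage_indices[num_layers - 1] = num_scales - 1
print(stage_indices)                                       # {0: 0, 2: 1, 7: 2, 9: 3}
```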
+ for i, index in enumerate(out_scales): + if index < 0: + out_scales[i] = self.num_scales + index + assert 0 <= out_scales[i] <= self.num_scales, \ + f'Invalid out_scales {index}' + self.out_scales = sorted(list(out_scales)) + + # Set patch embedding + _patch_cfg = dict( + in_channels=in_channels, + input_size=(temporal_size, spatial_size, spatial_size), + embed_dims=self.embed_dims, + conv_type='Conv3d', + ) + _patch_cfg.update(patch_cfg) + self.patch_embed = PatchEmbed3D(**_patch_cfg) + self.patch_resolution = self.patch_embed.init_out_size + + # Set cls token + if output_cls_token: + assert with_cls_token is True, f'with_cls_token must be True if' \ + f'set output_cls_token to True, but got {with_cls_token}' + self.with_cls_token = with_cls_token + self.output_cls_token = output_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims)) + + # Set absolute position embedding + if self.use_abs_pos_embed: + num_patches = np.prod(self.patch_resolution) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_extra_tokens, + self.embed_dims)) + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, self.num_layers) + + self.blocks = ModuleList() + out_dims_list = [self.embed_dims] + num_heads = self.num_heads + stride_kv = adaptive_kv_stride + input_size = self.patch_resolution + for i in range(self.num_layers): + if i in self.downscale_indices: + num_heads *= head_mul + stride_q = [1, 2, 2] + stride_kv = [max(s // 2, 1) for s in stride_kv] + else: + stride_q = [1, 1, 1] + + # Set output embed_dims + if dim_mul_in_attention and i in self.downscale_indices: + # multiply embed_dims in downscale layers. + out_dims = out_dims_list[-1] * dim_mul + elif not dim_mul_in_attention and i + 1 in self.downscale_indices: + # multiply embed_dims before downscale layers. + out_dims = out_dims_list[-1] * dim_mul + else: + out_dims = out_dims_list[-1] + + attention_block = MultiScaleBlock( + in_dims=out_dims_list[-1], + out_dims=out_dims, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop_path=dpr[i], + norm_cfg=norm_cfg, + qkv_pool_kernel=pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + rel_pos_embed=rel_pos_embed, + residual_pooling=residual_pooling, + dim_mul_in_attention=dim_mul_in_attention, + input_size=input_size, + rel_pos_zero_init=rel_pos_zero_init) + self.blocks.append(attention_block) + + input_size = attention_block.init_out_size + out_dims_list.append(out_dims) + + if i in self.stage_indices: + stage_index = self.stage_indices[i] + if stage_index in self.out_scales: + norm_layer = build_norm_layer(norm_cfg, out_dims)[1] + self.add_module(f'norm{stage_index}', norm_layer) + + def init_weights(self): + super().init_weights() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress default init if use pretrained model. 
+ return + + if self.use_abs_pos_embed: + trunc_normal_(self.pos_embed, std=0.02) + + def forward(self, x): + """Forward the MViT.""" + B = x.shape[0] + x, patch_resolution = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + if self.use_abs_pos_embed: + x = x + resize_pos_embed( + self.pos_embed, + self.patch_resolution, + patch_resolution, + mode=self.interpolate_mode, + num_extra_tokens=self.num_extra_tokens) + + if not self.with_cls_token: + # Remove class token for transformer encoder input + x = x[:, 1:] + + outs = [] + for i, block in enumerate(self.blocks): + x, patch_resolution = block(x, patch_resolution) + + if i in self.stage_indices: + stage_index = self.stage_indices[i] + if stage_index in self.out_scales: + B, _, C = x.shape + x = getattr(self, f'norm{stage_index}')(x) + tokens = x.transpose(1, 2) + if self.with_cls_token: + patch_token = tokens[:, :, 1:].reshape( + B, C, *patch_resolution) + cls_token = tokens[:, :, 0] + else: + patch_token = tokens.reshape(B, C, *patch_resolution) + cls_token = None + if self.output_cls_token: + out = [patch_token, cls_token] + else: + out = patch_token + outs.append(out) + + return tuple(outs) diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index 79f852dc26..3395f96fe3 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .base import BaseHead from .i3d_head import I3DHead +from .mvit_head import MVitHead from .slowfast_head import SlowFastHead from .stgcn_head import STGCNHead from .timesformer_head import TimeSformerHead @@ -13,5 +14,6 @@ __all__ = [ 'TSNHead', 'I3DHead', 'BaseHead', 'TSMHead', 'SlowFastHead', 'TPNHead', - 'X3DHead', 'TRNHead', 'TimeSformerHead', 'STGCNHead', 'TSNAudioHead' + 'X3DHead', 'TRNHead', 'TimeSformerHead', 'STGCNHead', 'TSNAudioHead', + 'MVitHead' ] diff --git a/mmaction/models/heads/mvit_head.py b/mmaction/models/heads/mvit_head.py new file mode 100644 index 0000000000..eac4d30266 --- /dev/null +++ b/mmaction/models/heads/mvit_head.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +from mmengine.model.weight_init import trunc_normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class MVitHead(BaseHead): + """Classification head for TimeSformer. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + dropout_ratio (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Defaults to 0.02. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
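The head only consumes the cls token of the last backbone scale (see `pre_logits` below). As a minimal sketch of that data flow, with illustrative shapes and a plain `nn.Linear` standing in for `fc_cls`:

```python
import torch
import torch.nn as nn

# The backbone returns a tuple of per-scale [patch_token, cls_token] pairs.
feats = ([torch.randn(2, 768, 8, 7, 7), torch.randn(2, 768)],)

_, cls_token = feats[-1]            # take the cls token of the last scale
fc_cls = nn.Linear(768, 400)        # in_channels=768, num_classes=400
cls_score = fc_cls(cls_token)
print(cls_score.shape)              # torch.Size([2, 400])
```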
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + dropout_ratio: float = 0.5, + init_std: float = 0.02, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.init_std = init_std + self.dropout_ratio = dropout_ratio + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + trunc_normal_init(self.fc_cls, std=self.init_std) + + def pre_logits(self, feats: Tuple[List[Tensor]]) -> Tensor: + """The process before the final classification head. + + The input ``feats`` is a tuple of list of tensor, and each tensor is + the feature of a backbone stage. + """ + _, cls_token = feats[-1] + return cls_token + + def forward(self, x: Tuple[List[Tensor]], **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tuple[List[Tensor]]): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + x = self.pre_logits(x) + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/recognizers/recognizer3d.py b/mmaction/models/recognizers/recognizer3d.py index 9de211d618..bb7e250157 100644 --- a/mmaction/models/recognizers/recognizer3d.py +++ b/mmaction/models/recognizers/recognizer3d.py @@ -69,16 +69,21 @@ def extract_feat(self, feat, _ = self.neck(feat) feats.append(feat) view_ptr += max_testing_views - # should consider the case that feat is a tuple - if isinstance(feats[0], tuple): - len_tuple = len(feats[0]) - feats = [ - torch.cat([each[i] for each in feats]) - for i in range(len_tuple) - ] - x = tuple(feats) - else: - x = torch.cat(feats) + # recursively traverse feats until it's a tensor, then concat + + def recursively_cat(feats): + out_feats = [] + for e_idx, elem in enumerate(feats[0]): + batch_elem = [feat[e_idx] for feat in feats] + if not isinstance(elem, torch.Tensor): + batch_elem = recursively_cat(batch_elem) + else: + batch_elem = torch.cat(batch_elem) + out_feats.append(batch_elem) + + return tuple(out_feats) + + x = recursively_cat(feats) else: x = self.backbone(inputs) if self.with_neck: diff --git a/mmaction/models/utils/__init__.py b/mmaction/models/utils/__init__.py index ed6ac50522..865ccbea99 100644 --- a/mmaction/models/utils/__init__.py +++ b/mmaction/models/utils/__init__.py @@ -1,6 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .blending_utils import (BaseMiniBatchBlending, CutmixBlending, - MixupBlending) + MixupBlending, RandomBatchAugment) from .graph import Graph -__all__ = ['BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'Graph'] +__all__ = [ + 'BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'Graph', + 'RandomBatchAugment' +] diff --git a/mmaction/models/utils/blending_utils.py b/mmaction/models/utils/blending_utils.py index 94b929d7ff..64808d32f7 100644 --- a/mmaction/models/utils/blending_utils.py +++ b/mmaction/models/utils/blending_utils.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from abc import ABCMeta, abstractmethod +from typing import Union +import numpy as np import torch import torch.nn.functional as F from torch import Tensor @@ -177,3 +179,69 @@ def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: label = lam * label + (1 - lam) * label[rand_index, :] return imgs, label + + +@MODELS.register_module() +class RandomBatchAugment(BaseMiniBatchBlending): + """Randomly choose one batch augmentation to apply. + + Args: + augments (dict | list): configs of batch + augmentations. + probs (float | List[float] | None): The probabilities of each batch + augmentations. If None, choose evenly. Defaults to None. + + Example: + >>> augments_cfg = [ + ... dict(type='CutmixBlending', alpha=1., num_classes=10), + ... dict(type='MixupBlending', alpha=1., num_classes=10) + ... ] + >>> batch_augment = RandomBatchAugment(augments_cfg, probs=[0.5, 0.3]) + >>> imgs = torch.randn(16, 3, 8, 32, 32) + >>> label = torch.randint(0, 10, (16, )) + >>> imgs, label = batch_augment(imgs, label) + + .. note :: + + To decide which batch augmentation will be used, it picks one of + ``augments`` based on the probabilities. In the example above, the + probability to use CutmixBlending is 0.5, to use MixupBlending is 0.3, + and to do nothing is 0.2. + """ + + def __init__(self, augments: Union[dict, list], probs=None): + if not isinstance(augments, (tuple, list)): + augments = [augments] + + self.augments = [] + for aug in augments: + assert isinstance(aug, dict), \ + f'blending augment config must be a dict. Got {type(aug)}' + self.augments.append(MODELS.build(aug)) + + self.num_classes = augments[0].get('num_classes') + + if isinstance(probs, float): + probs = [probs] + + if probs is not None: + assert len(augments) == len(probs), \ + '``augments`` and ``probs`` must have same lengths. ' \ + f'Got {len(augments)} vs {len(probs)}.' + assert sum(probs) <= 1, \ + 'The total probability of batch augments exceeds 1.' + self.augments.append(None) + probs.append(1 - sum(probs)) + + self.probs = probs + + def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + """Randomly apply batch augmentations to the batch inputs and batch + data samples.""" + aug_index = np.random.choice(len(self.augments), p=self.probs) + aug = self.augments[aug_index] + + if aug is not None: + return aug.do_blending(imgs, label, **kwargs) + else: + return imgs, label diff --git a/mmaction/models/utils/embed.py b/mmaction/models/utils/embed.py new file mode 100644 index 0000000000..bfe805fb32 --- /dev/null +++ b/mmaction/models/utils/embed.py @@ -0,0 +1,234 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule +from mmengine.utils import to_3tuple + + +class AdaptivePadding(nn.Module): + """Applies padding adaptively to the input. + + This module can make input get fully covered by filter + you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad + zero around input. The "corner" mode would pad zero + to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel. Default: 1. + stride (int | tuple): Stride of the filter. Default: 1. + dilation (int | tuple): Spacing between kernel elements. + Default: 1. + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. 
Default: "corner". + + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + super().__init__() + assert padding in ('same', 'corner') + + kernel_size = to_3tuple(kernel_size) + stride = to_3tuple(stride) + dilation = to_3tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + """Calculate the padding size of input. + + Args: + input_shape (:obj:`torch.Size`): arrange as (H, W). + + Returns: + Tuple[int]: The padding size along the + original H and W directions + """ + input_t, input_h, input_w = input_shape + kernel_d, kernel_h, kernel_w = self.kernel_size + stride_d, stride_h, stride_w = self.stride + output_d = math.ceil(input_t / stride_d) + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_d = max((output_d - 1) * stride_d + + (kernel_d - 1) * self.dilation[0] + 1 - input_t, 0) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[1] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[2] + 1 - input_w, 0) + return pad_d, pad_h, pad_w + + def forward(self, x): + """Add padding to `x` + + Args: + x (Tensor): Input tensor has shape (B, C, H, W). + + Returns: + Tensor: The tensor with adaptive padding + """ + pad_d, pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_d > 0 or pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h, 0, pad_d]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, + pad_w - pad_w // 2, + pad_h // 2, + pad_h - pad_h // 2, + pad_d // 2, + pad_d - pad_d // 2, + ]) + return x + + +class PatchEmbed3D(BaseModule): + """Video to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The type of convolution + to generate patch embedding. Default: "Conv3d". + kernel_size (int): The kernel_size of embedding conv. + Default: (2, 4, 4). + stride (int): The slide stride of embedding conv. + Default: (2, 4, 4). + padding (int | tuple | string): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv3d', + kernel_size=(2, 4, 4), + stride=(2, 4, 4), + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_3tuple(kernel_size) + stride = to_3tuple(stride) + dilation = to_3tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adaptive_padding = None + padding = to_3tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_3tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # e.g. when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adaptive_padding: + pad_d, pad_h, pad_w = self.adaptive_padding.get_pad_shape( + input_size) + input_t, input_h, input_w = input_size + input_t = input_t + pad_d + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_t, input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv3d.html + t_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + h_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + w_out = (input_size[2] + 2 * padding[2] - dilation[2] * + (kernel_size[2] - 1) - 1) // stride[2] + 1 + self.init_out_size = (t_out, h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, T, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_t * out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_t, out_h, out_w). 
+ """ + + if self.adaptive_padding: + x = self.adaptive_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3], x.shape[4]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size diff --git a/tests/datasets/transforms/test_pose_loading.py b/tests/datasets/transforms/test_pose_loading.py index eeb2dad84c..fd7568798f 100644 --- a/tests/datasets/transforms/test_pose_loading.py +++ b/tests/datasets/transforms/test_pose_loading.py @@ -10,97 +10,11 @@ from numpy.testing import assert_array_almost_equal, assert_array_equal from mmaction.datasets.transforms import (GeneratePoseTarget, LoadKineticsPose, - PaddingWithLoop, PoseDecode, - UniformSampleFrames) + PaddingWithLoop, PoseDecode) class TestPoseLoading: - @staticmethod - def test_uniform_sample_frames(): - results = dict(total_frames=64, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=True, seed=0) - - assert str(sampling) == ('UniformSampleFrames(clip_len=8, ' - 'num_clips=1, test_mode=True, seed=0)') - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert_array_equal(sampling_results['frame_inds'], - np.array([4, 15, 21, 24, 35, 43, 51, 63])) - - results = dict(total_frames=15, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=True, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert_array_equal(sampling_results['frame_inds'], - np.array([0, 2, 4, 6, 8, 9, 11, 13])) - - results = dict(total_frames=7, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=True, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert_array_equal(sampling_results['frame_inds'], - np.array([0, 1, 2, 3, 4, 5, 6, 0])) - - results = dict(total_frames=7, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=8, test_mode=True, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 8 - assert len(sampling_results['frame_inds']) == 64 - - results = dict(total_frames=64, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=4, test_mode=True, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 4 - assert_array_equal( - sampling_results['frame_inds'], - np.array([ - 4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56, - 0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60 - ])) - - results = dict(total_frames=64, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=False, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert len(sampling_results['frame_inds']) == 8 - - results = dict(total_frames=7, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=False, seed=0) - 
sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert len(sampling_results['frame_inds']) == 8 - - results = dict(total_frames=15, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=False, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert len(sampling_results['frame_inds']) == 8 - @staticmethod def test_pose_decode(): kp = np.random.random([1, 16, 17, 2]) diff --git a/tests/datasets/transforms/test_sampling.py b/tests/datasets/transforms/test_sampling.py index f4a5e457bd..9450682315 100644 --- a/tests/datasets/transforms/test_sampling.py +++ b/tests/datasets/transforms/test_sampling.py @@ -9,7 +9,8 @@ from mmaction.datasets.transforms import (AudioFeatureSelector, DenseSampleFrames, SampleAVAFrames, - SampleFrames, UntrimmedSampleFrames) + SampleFrames, UniformSampleFrames, + UntrimmedSampleFrames) class BaseTestLoading: @@ -401,6 +402,90 @@ def check_monotonous(arr): assert np.max(sample_frames_results['frame_inds']) <= 40 assert np.min(sample_frames_results['frame_inds']) >= 1 + def test_uniform_sample_frames(self): + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + + assert str(sampling) == ('UniformSampleFrames(clip_len=8, ' + 'num_clips=1, test_mode=True, seed=0)') + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([4, 15, 21, 24, 35, 43, 51, 63])) + + results = dict(total_frames=15, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([0, 2, 4, 6, 8, 9, 11, 13])) + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([0, 1, 2, 3, 4, 5, 6, 0])) + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=8, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 8 + assert len(sampling_results['frame_inds']) == 64 + + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=4, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 4 + assert_array_equal( + sampling_results['frame_inds'], + np.array([ + 4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56, + 0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60 + ])) + + 
results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + results = dict(total_frames=15, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + def test_dense_sample_frames(self): target_keys = [ 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', diff --git a/tests/models/backbones/test_mvit.py b/tests/models/backbones/test_mvit.py new file mode 100644 index 0000000000..4ebdbc26db --- /dev/null +++ b/tests/models/backbones/test_mvit.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from copy import deepcopy +from unittest import TestCase + +import torch + +from mmaction.models import MViT + + +class TestMViT(TestCase): + + def setUp(self): + self.cfg = dict(arch='tiny', drop_path_rate=0.1) + + def test_structure(self): + # Test invalid default arch + with self.assertRaisesRegex(AssertionError, 'not in default archs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = 'unknown' + MViT(**cfg) + + # Test invalid custom arch + with self.assertRaisesRegex(AssertionError, 'Custom arch needs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'num_layers': 24, + 'num_heads': 16, + 'feedforward_channels': 4096 + } + MViT(**cfg) + + # Test custom arch + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'embed_dims': 96, + 'num_layers': 10, + 'num_heads': 1, + 'downscale_indices': [2, 5, 8] + } + stage_indices = [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] + model = MViT(**cfg) + self.assertEqual(model.embed_dims, 96) + self.assertEqual(model.num_layers, 10) + for i, block in enumerate(model.blocks): + stage = stage_indices[i] + self.assertEqual(block.out_dims, 96 * 2**(stage)) + + # Test out_indices + cfg = deepcopy(self.cfg) + cfg['out_scales'] = {1: 1} + with self.assertRaisesRegex(AssertionError, "get "): + MViT(**cfg) + cfg['out_scales'] = [0, 13] + with self.assertRaisesRegex(AssertionError, 'Invalid out_scales 13'): + MViT(**cfg) + + # Test model structure + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + stage_indices = [0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3] + self.assertEqual(len(model.blocks), 10) + dpr_inc = 0.1 / (10 - 1) + dpr = 0 + for i, block in enumerate(model.blocks): + stage = stage_indices[i] + print(i, stage) + self.assertEqual(block.attn.num_heads, 2**stage) + if dpr > 0: + self.assertAlmostEqual(block.drop_path.drop_prob, dpr) + dpr += dpr_inc + + def test_init_weights(self): + # test weight init cfg + cfg = deepcopy(self.cfg) + cfg['init_cfg'] = [ + dict( + type='Kaiming', + layer='Conv3d', + mode='fan_in', + nonlinearity='linear') + ] + cfg['use_abs_pos_embed'] = True + model = MViT(**cfg) + ori_weight = 
model.patch_embed.projection.weight.clone().detach() + # The pos_embed is all zero before initialize + self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.))) + + model.init_weights() + initialized_weight = model.patch_embed.projection.weight + self.assertFalse(torch.allclose(ori_weight, initialized_weight)) + self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.))) + + def test_forward(self): + imgs = torch.randn(1, 3, 16, 224, 224) + + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token, cls_token = outs[-1] + self.assertEqual(patch_token.shape, (1, 768, 8, 7, 7)) + + # Test forward with multi out scales + cfg = deepcopy(self.cfg) + cfg['out_scales'] = (0, 1, 2, 3) + model = MViT(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 4) + for stage, out in enumerate(outs): + stride = 2**stage + patch_token, cls_token = out + self.assertEqual(patch_token.shape, + (1, 96 * stride, 8, 56 // stride, 56 // stride)) + self.assertEqual(cls_token.shape, (1, 96 * stride)) + + # Test forward with dynamic input size + imgs1 = torch.randn(1, 3, 16, 224, 224) + imgs2 = torch.randn(1, 3, 16, 256, 256) + imgs3 = torch.randn(1, 3, 16, 256, 309) + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + for imgs in [imgs1, imgs2, imgs3]: + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token, cls_token = outs[-1] + expect_feat_shape = (math.ceil(imgs.shape[2] / 2), + math.ceil(imgs.shape[3] / 32), + math.ceil(imgs.shape[4] / 32)) + self.assertEqual(patch_token.shape, (1, 768, *expect_feat_shape)) + self.assertEqual(cls_token.shape, (1, 768)) diff --git a/tests/models/utils/test_blending_utils.py b/tests/models/utils/test_blending_utils.py index 2c19267681..359d9225dc 100644 --- a/tests/models/utils/test_blending_utils.py +++ b/tests/models/utils/test_blending_utils.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
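The `expect_feat_shape` used in the dynamic-input test above follows from the overall downsampling of this MViT variant: 2x in time (the 2x4x4 patchify) and 32x spatially (4x patchify followed by three 2x2 spatial downscales). A quick check for the 16x256x309 case:

```python
import math

t, h, w = 16, 256, 309
print(math.ceil(t / 2), math.ceil(h / 32), math.ceil(w / 32))  # 8 8 10
```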
+import numpy as np +import pytest import torch from mmengine.structures import LabelData -from mmaction.models import CutmixBlending, MixupBlending +from mmaction.models import CutmixBlending, MixupBlending, RandomBatchAugment from mmaction.structures import ActionDataSample @@ -53,3 +55,41 @@ def test_cutmix(): mixed_imgs, mixed_label = mixup(imgs, label) assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) assert len(mixed_label) == 4 + + +def test_rand_blend(): + alpha_mixup = 0.2 + alpha_cutmix = 0.2 + num_classes = 10 + label = get_label(torch.randint(0, num_classes, (4, ))) + blending_augs = [ + dict(type='MixupBlending', alpha=alpha_mixup, num_classes=num_classes), + dict( + type='CutmixBlending', alpha=alpha_cutmix, num_classes=num_classes) + ] + + # test assertion + with pytest.raises(AssertionError): + rand_mix = RandomBatchAugment(blending_augs, [0.5, 0.6]) + + # mixup, cutmix + rand_mix = RandomBatchAugment(blending_augs, probs=None) + assert rand_mix.probs is None + + # mixup, cutmix and None + probs = [0.5, 0.4] + rand_mix = RandomBatchAugment(blending_augs, probs) + + np.testing.assert_allclose(rand_mix.probs[-1], 0.1) + + # test call + imgs = torch.randn(4, 4, 3, 32, 32) # NCHW imgs + mixed_imgs, mixed_label = rand_mix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 3, 32, 32)) + assert len(mixed_label) == 4 + + imgs = torch.randn(4, 4, 2, 3, 32, 32) # NCTHW imgs + label = get_label(torch.randint(0, num_classes, (4, ))) + mixed_imgs, mixed_label = rand_mix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) + assert len(mixed_label) == 4 From 25072a145b8d646c57685656b0fb9b5ad11bb955 Mon Sep 17 00:00:00 2001 From: lilin Date: Fri, 21 Oct 2022 15:04:33 +0800 Subject: [PATCH 2/8] [feat] support mvit --- configs/_base_/models/mvit_small.py | 12 +- configs/recognition/mvit/README.md | 24 ++-- configs/recognition/mvit/metafile.yml | 115 ++++++++++++++++++ .../mvit-base-p244_32x3x1_kinetics400-rgb.py | 14 ++- .../mvit/mvit-base-p244_u32_sthv2-rgb.py | 12 +- .../mvit-large-p244_40x3x1_kinetics400-rgb.py | 11 ++ .../mvit/mvit-large-p244_u40_sthv2-rgb.py | 12 +- .../mvit-small-p244_16x4x1_kinetics400-rgb.py | 13 ++ .../mvit/mvit-small-p244_u16_sthv2-rgb.py | 4 +- mmaction/datasets/transforms/loading.py | 74 ----------- mmaction/models/backbones/mvit.py | 34 ++++-- mmaction/models/heads/__init__.py | 4 +- mmaction/models/heads/mvit_head.py | 9 +- tests/models/backbones/test_mvit.py | 12 +- tests/models/heads/test_mvit_head.py | 32 +++++ 15 files changed, 260 insertions(+), 122 deletions(-) create mode 100644 configs/recognition/mvit/metafile.yml create mode 100644 tests/models/heads/test_mvit_head.py diff --git a/configs/_base_/models/mvit_small.py b/configs/_base_/models/mvit_small.py index 727df37c38..d6a94daa23 100644 --- a/configs/_base_/models/mvit_small.py +++ b/configs/_base_/models/mvit_small.py @@ -3,17 +3,11 @@ backbone=dict(type='MViT', arch='small', drop_path_rate=0.2), data_preprocessor=dict( type='ActionDataPreprocessor', - mean=[114.75, 114.75, 114.75], - std=[57.375, 57.375, 57.375], - blending=dict( - type='RandomBatchAugment', - augments=[ - dict(type='MixupBlending', alpha=0.8, num_classes=400), - dict(type='CutmixBlending', alpha=1, num_classes=400) - ]), + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], format_shape='NCTHW'), cls_head=dict( - type='MVitHead', + type='MViTHead', in_channels=768, num_classes=400, label_smooth_eps=0.1, diff --git a/configs/recognition/mvit/README.md 
b/configs/recognition/mvit/README.md index fdc694a128..ccd9611c2d 100644 --- a/configs/recognition/mvit/README.md +++ b/configs/recognition/mvit/README.md @@ -27,21 +27,21 @@ well as 86.1% on Kinetics-400 video classification. ### Kinetics-400 -| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | params | config | ckpt | -| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :----: | :------------------: | :-----------------: | -| 16x4x1 | short-side 320 | MViTv2-S\* | From scratch | 81.1 | 94.7 | [81.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | xx.xM | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | -| 32x3x1 | short-side 320 | MViTv2-B\* | From scratch | 82.6 | 95.8 | [82.9](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [95.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | xx.xM | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | -| 40x3x1 | short-side 320 | MViTv2-L\* | From scratch | 85.4 | 96.2 | [86.1](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [97.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | FLOPs | params | config | ckpt | +| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :-----------------------------: | :-----------------------------: | :--------------: | :---: | :----: | :-----------------: | :---------------: | +| 16x4x1 | short-side 320 | MViTv2-S\* | From scratch | 81.1 | 94.7 | [81.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | 64G | 34.5M | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth) | +| 32x3x1 | short-side 320 | MViTv2-B\* | From scratch | 82.6 | 95.8 | [82.9](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [95.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | 225G | 51.2M | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_32x3x1_kinetics400-rgb_20221021-f392cd2d.pth) | +| 40x3x1 | short-side 320 | MViTv2-L\* | From scratch | 85.4 | 96.2 | [86.1](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 
[97.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 3 crop | 2828G | 213M | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_40x3x1_kinetics400-rgb_20221021-11fe1f97.pth) | ### Something-Something V2 -| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | params | config | ckpt | -| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :----: | :------------------: | :-----------------: | -| uniform 16 | short-side 320 | MViTv2-S\* | K400 | 68.1 | 91.0 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | -| uniform 32 | short-side 320 | MViTv2-B\* | K400 | 70.8 | 92.7 | [70.5](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [92.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | -| uniform 40 | short-side 320 | MViTv2-L\* | IN21K + K400 | 73.2 | 94.0 | [73.3](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | FLOPs | params | config | ckpt | +| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :----------------------------: | :-----------------------------: | :---------------: | :---: | :----: | :-----------------: | :---------------: | +| uniform 16 | short-side 320 | MViTv2-S\* | K400 | 68.1 | 91.0 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crops | 64G | 34.4M | [config](/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_u16_sthv2-rgb_20221021-65ecae7d.pth) | +| uniform 32 | short-side 320 | MViTv2-B\* | K400 | 70.8 | 92.7 | [70.5](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [92.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crops | 225G | 51.1M | [config](/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_u32_sthv2-rgb_20221021-d5de5da6.pth) | +| uniform 40 | short-side 320 | MViTv2-L\* | IN21K + K400 | 73.2 | 94.0 | 
[73.3](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crops | 2828G | 213M | [config](/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_u40_sthv2-rgb_20221021-61696e07.pth) | -*Models with * are ported from the repo [SlowFast](https://github.com/facebookresearch/SlowFast/) and tested on our data. Currently, we only support the testing of X3D models, training will be available soon.* +*Models with * are ported from the repo [SlowFast](https://github.com/facebookresearch/SlowFast/) and tested on our data. Currently, we only support the testing of MViT models, training will be available soon.* 1. The values in columns named after "reference" are copied from paper 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. @@ -59,7 +59,7 @@ python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] Example: test MViT model on Kinetics-400 dataset and dump the result to a pkl file. ```shell -python tools/test.py configs/recognition/mvit/mvit-small_16x4x1_kinetics400-rgb.py \ +python tools/test.py configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py \ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` diff --git a/configs/recognition/mvit/metafile.yml b/configs/recognition/mvit/metafile.yml new file mode 100644 index 0000000000..c5d7107482 --- /dev/null +++ b/configs/recognition/mvit/metafile.yml @@ -0,0 +1,115 @@ +Collections: +- Name: MViT + README: configs/recognition/MViT/README.md + Paper: + URL: http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf + Title: "MViTv2: Improved Multiscale Vision Transformers for Classification and Detection" + +Models: + - Name: mvit-small-p244_16x4x1_kinetics400-rgb + Config: configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.1 + Top 5 Accuracy: 94.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth + + - Name: mvit-base-p244_32x3x1_kinetics400-rgb + Config: configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-base + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + 
Task: Action Recognition + Metrics: + Top 1 Accuracy: 82.6 + Top 5 Accuracy: 95.8 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_32x3x1_kinetics400-rgb_20221021-f392cd2d.pth + + - Name: mvit-large-p244_40x3x1_kinetics400-rgb + Config: configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-large + Resolution: short-side 446 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 85.4 + Top 5 Accuracy: 96.2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_40x3x1_kinetics400-rgb_20221021-11fe1f97.pth + + - Name: mvit-small-p244_u16_sthv2-rgb + Config: configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 68.1 + Top 5 Accuracy: 91.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_u16_sthv2-rgb_20221021-65ecae7d.pth + + - Name: mvit-base-p244_u32_sthv2-rgb + Config: configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-base + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 70.8 + Top 5 Accuracy: 92.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_u32_sthv2-rgb_20221021-d5de5da6.pth + + - Name: mvit-large-p244_u40_sthv2-rgb + Config: configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-large + Resolution: short-side 446 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.2 + Top 5 Accuracy: 94.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_u40_sthv2-rgb_20221021-61696e07.pth diff --git a/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py index 93b33a9dc9..b1e186f195 100644 --- a/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py +++ b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py @@ -7,7 +7,19 @@ arch='base', temporal_size=32, drop_path_rate=0.3, - )) + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), +) # dataset settings dataset_type = 
'VideoDataset' diff --git a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py index c719396f29..944e17440d 100644 --- a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py @@ -8,6 +8,17 @@ temporal_size=32, drop_path_rate=0.3, ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=174), + dict(type='CutmixBlending', alpha=1, num_classes=174) + ]), + format_shape='NCTHW'), cls_head=dict(num_classes=174)) # dataset settings @@ -34,7 +45,6 @@ op='RandAugment', magnitude=7, num_layers=4), - dict(type='Flip', flip_ratio=0.5), dict(type='RandomErasing', erase_prob=0.25, mode='rand'), dict(type='FormatShape', input_format='NCTHW'), dict(type='PackActionInputs') diff --git a/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py index 883d9f7ce5..8c93519914 100644 --- a/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py +++ b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py @@ -9,6 +9,17 @@ spatial_size=312, drop_path_rate=0.75, ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), cls_head=dict(in_channels=1152), test_cfg=dict(max_testing_views=5)) diff --git a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py index c682571df6..9b47b27a10 100644 --- a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py @@ -9,6 +9,17 @@ spatial_size=312, drop_path_rate=0.75, ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), cls_head=dict(in_channels=1152, num_classes=174), test_cfg=dict(max_testing_views=5)) @@ -36,7 +47,6 @@ op='RandAugment', magnitude=7, num_layers=4), - dict(type='Flip', flip_ratio=0.5), dict(type='RandomErasing', erase_prob=0.25, mode='rand'), dict(type='FormatShape', input_format='NCTHW'), dict(type='PackActionInputs') diff --git a/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py index 0df0b835fa..4da89b5a4a 100644 --- a/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py +++ b/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py @@ -2,6 +2,19 @@ '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' ] +model = dict( + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), ) + # dataset 
settings dataset_type = 'VideoDataset' data_root = 'data/kinetics400/videos_train' diff --git a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py index 7327df2e11..23f404db53 100644 --- a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py @@ -28,7 +28,6 @@ op='RandAugment', magnitude=7, num_layers=4), - dict(type='Flip', flip_ratio=0.5), dict(type='RandomErasing', erase_prob=0.25, mode='rand'), dict(type='FormatShape', input_format='NCTHW'), dict(type='PackActionInputs') @@ -105,7 +104,8 @@ optim_wrapper = dict( type='AmpOptimWrapper', optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)) param_scheduler = [ dict( diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index e756410dac..d050c40f4a 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -265,80 +265,6 @@ def __repr__(self): return repr_str -@TRANSFORMS.register_module() -class SampleFramesV2(SampleFrames): - """Sample frames from the video. - - Required keys are "total_frames", "start_index" , added or modified keys - are "frame_inds", "frame_interval" and "num_clips". - Args: - clip_len (int): Frames of each sampled output clip. - frame_interval (int): Temporal interval of adjacent sampled frames. - Default: 1. - num_clips (int): Number of clips to be sampled. Default: 1. - temporal_jitter (bool): Whether to apply temporal jittering. - Default: False. - out_of_bound_opt (str): The way to deal with out of bounds frame - indexes. Available options are 'loop', 'repeat_last'. - Default: 'loop'. - test_mode (bool): Store True when building test or validation dataset. - Default: False. - start_index (None): This argument is deprecated and moved to dataset - class (``BaseDataset``, ``VideoDatset``, ``RawframeDataset``, etc), - see this: https://github.com/open-mmlab/mmaction2/pull/89. - keep_tail_frames (bool): Whether to keep tail frames when sampling. - Default: False. - """ - - def __init__(self, - clip_len, - frame_interval=1, - num_clips=1, - temporal_jitter=False, - out_of_bound_opt='loop', - test_mode=False, - keep_tail_frames=False): - super().__init__(clip_len, frame_interval, num_clips, temporal_jitter, - False, out_of_bound_opt, test_mode, keep_tail_frames) - - def _get_train_clips(self, num_frames): - """Get clip offsets in train mode. - - Args: - num_frames (int): Total number of frame in the video. - Returns: - np.ndarray: Sampled frame indices in train mode. - """ - ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 - max_offset = max(num_frames - ori_clip_len, 0) - - num_segments = max(self.num_clips - 1, 1) - offset_between = max_offset / num_segments - clip_offsets = np.arange(self.num_clips) * offset_between - clip_offsets += np.random.uniform(0, offset_between, self.num_clips) - clip_offsets = np.round(clip_offsets).astype(np.int32) - return clip_offsets - - def _get_test_clips(self, num_frames): - """Get clip offsets in test mode. - - If the total number of frames is - not enough, it will return all zero indices. - Args: - num_frames (int): Total number of frame in the video. - Returns: - np.ndarray: Sampled frame indices in test mode. 
- """ - ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 - max_offset = max(num_frames - ori_clip_len, 0) - - num_segments = max(self.num_clips - 1, 1) - offset_between = max_offset / float(num_segments) - clip_offsets = np.arange(self.num_clips) * offset_between - clip_offsets = np.round(clip_offsets).astype(np.int32) - return clip_offsets - - @TRANSFORMS.register_module() class UniformSampleFrames(BaseTransform): """Uniformly sample frames from the video. diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py index 7974767cfc..1fb6b36290 100644 --- a/mmaction/models/backbones/mvit.py +++ b/mmaction/models/backbones/mvit.py @@ -7,8 +7,10 @@ import torch.nn.functional as F from mmcv.cnn import build_activation_layer, build_norm_layer from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger from mmengine.model import BaseModule, ModuleList -from mmengine.model.weight_init import trunc_normal_ +from mmengine.model.weight_init import constant_init, trunc_normal_ +from mmengine.runner import load_checkpoint from mmengine.utils import to_3tuple from mmaction.registry import MODELS @@ -332,7 +334,6 @@ def init_weights(self): if not self.rel_pos_zero_init: trunc_normal_(self.rel_pos_h, std=0.02) trunc_normal_(self.rel_pos_w, std=0.02) - if not self.rel_pos_zero_init: trunc_normal_(self.rel_pos_t, std=0.02) def forward(self, x, in_size): @@ -672,10 +673,9 @@ def __init__(self, stride=(2, 4, 4), padding=(1, 3, 3)), init_cfg=None): - if pretrained: - init_cfg = dict(type='Pretrained', checkpoint=pretrained) super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained if isinstance(arch, str): arch = arch.lower() assert arch in set(self.arch_zoo), \ @@ -793,13 +793,27 @@ def __init__(self, norm_layer = build_norm_layer(norm_cfg, out_dims)[1] self.add_module(f'norm{stage_index}', norm_layer) - def init_weights(self): - super().init_weights() + def init_weights(self, pretrained: Optional[str] = None) -> None: - if (isinstance(self.init_cfg, dict) - and self.init_cfg['type'] == 'Pretrained'): - # Suppress default init if use pretrained model. - return + def _init_weights(m): + if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv3d)): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + constant_init(m.bias, 0.02) + elif isinstance(m, nn.LayerNorm): + constant_init(m.bias, 0.02) + constant_init(m.weight, 1.0) + + if pretrained: + self.pretrained = pretrained + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + elif self.pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') if self.use_abs_pos_embed: trunc_normal_(self.pos_embed, std=0.02) diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index 3395f96fe3..c803fc8561 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .base import BaseHead from .i3d_head import I3DHead -from .mvit_head import MVitHead +from .mvit_head import MViTHead from .slowfast_head import SlowFastHead from .stgcn_head import STGCNHead from .timesformer_head import TimeSformerHead @@ -15,5 +15,5 @@ __all__ = [ 'TSNHead', 'I3DHead', 'BaseHead', 'TSMHead', 'SlowFastHead', 'TPNHead', 'X3DHead', 'TRNHead', 'TimeSformerHead', 'STGCNHead', 'TSNAudioHead', - 'MVitHead' + 'MViTHead' ] diff --git a/mmaction/models/heads/mvit_head.py b/mmaction/models/heads/mvit_head.py index eac4d30266..c5df34ea17 100644 --- a/mmaction/models/heads/mvit_head.py +++ b/mmaction/models/heads/mvit_head.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import List, Tuple -from mmengine.model.weight_init import trunc_normal_init +from mmengine.model.weight_init import constant_init, trunc_normal_init from torch import Tensor, nn from mmaction.registry import MODELS @@ -10,8 +10,8 @@ @MODELS.register_module() -class MVitHead(BaseHead): - """Classification head for TimeSformer. +class MViTHead(BaseHead): + """Classification head for Multi-scale ViT. Args: num_classes (int): Number of classes to be classified. @@ -42,7 +42,8 @@ def __init__(self, def init_weights(self) -> None: """Initiate the parameters from scratch.""" - trunc_normal_init(self.fc_cls, std=self.init_std) + trunc_normal_init(self.fc_cls.weight, std=self.init_std) + constant_init(self.fc_cls.bias, 0.02) def pre_logits(self, feats: Tuple[List[Tensor]]) -> Tensor: """The process before the final classification head. diff --git a/tests/models/backbones/test_mvit.py b/tests/models/backbones/test_mvit.py index 4ebdbc26db..633cf73872 100644 --- a/tests/models/backbones/test_mvit.py +++ b/tests/models/backbones/test_mvit.py @@ -92,7 +92,7 @@ def test_init_weights(self): self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.))) def test_forward(self): - imgs = torch.randn(1, 3, 16, 224, 224) + imgs = torch.randn(1, 3, 6, 64, 64) cfg = deepcopy(self.cfg) model = MViT(**cfg) @@ -100,7 +100,7 @@ def test_forward(self): self.assertIsInstance(outs, tuple) self.assertEqual(len(outs), 1) patch_token, cls_token = outs[-1] - self.assertEqual(patch_token.shape, (1, 768, 8, 7, 7)) + self.assertEqual(patch_token.shape, (1, 768, 3, 2, 2)) # Test forward with multi out scales cfg = deepcopy(self.cfg) @@ -113,13 +113,13 @@ def test_forward(self): stride = 2**stage patch_token, cls_token = out self.assertEqual(patch_token.shape, - (1, 96 * stride, 8, 56 // stride, 56 // stride)) + (1, 96 * stride, 3, 16 // stride, 16 // stride)) self.assertEqual(cls_token.shape, (1, 96 * stride)) # Test forward with dynamic input size - imgs1 = torch.randn(1, 3, 16, 224, 224) - imgs2 = torch.randn(1, 3, 16, 256, 256) - imgs3 = torch.randn(1, 3, 16, 256, 309) + imgs1 = torch.randn(1, 3, 2, 64, 64) + imgs2 = torch.randn(1, 3, 2, 96, 96) + imgs3 = torch.randn(1, 3, 2, 96, 128) cfg = deepcopy(self.cfg) model = MViT(**cfg) for imgs in [imgs1, imgs2, imgs3]: diff --git a/tests/models/heads/test_mvit_head.py b/tests/models/heads/test_mvit_head.py new file mode 100644 index 0000000000..8f64f5bf06 --- /dev/null +++ b/tests/models/heads/test_mvit_head.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +import torch.nn as nn + +from mmaction.models import MViTHead + + +class TestMViTHead(TestCase): + DEFAULT_ARGS = dict(in_channels=768, num_classes=5) + fake_feats = ([torch.rand(4, 768, 3, 2, 2), torch.rand(4, 768)], ) + + def test_init(self): + head = MViTHead(**self.DEFAULT_ARGS) + head.init_weights() + self.assertEqual(head.dropout.p, head.dropout_ratio) + self.assertIsInstance(head.fc_cls, nn.Linear) + self.assertEqual(head.num_classes, 5) + self.assertEqual(head.dropout_ratio, 0.5) + self.assertEqual(head.in_channels, 768) + self.assertEqual(head.init_std, 0.02) + + def test_pre_logits(self): + head = MViTHead(**self.DEFAULT_ARGS) + pre_logits = head.pre_logits(self.fake_feats) + self.assertIs(pre_logits, self.fake_feats[-1][1]) + + def test_forward(self): + head = MViTHead(**self.DEFAULT_ARGS) + cls_score = head(self.fake_feats) + self.assertEqual(cls_score.shape, (4, 5)) From c9304c3771d9aaedf94edca1fecc206309facd91 Mon Sep 17 00:00:00 2001 From: lilin Date: Tue, 22 Nov 2022 19:05:43 +0800 Subject: [PATCH 3/8] [doc] fix docstring --- mmaction/models/backbones/mvit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py index 1fb6b36290..f52aa35688 100644 --- a/mmaction/models/backbones/mvit.py +++ b/mmaction/models/backbones/mvit.py @@ -607,7 +607,8 @@ class MViT(BaseModule): >>> register_all_modules() >>> >>> cfg = dict(type='MViT', arch='tiny', out_scales=[0, 1, 2, 3]) - >>> model = model = MODELS.build(cfg) + >>> model = MODELS.build(cfg) + >>> model.init_weights() >>> inputs = torch.rand(1, 3, 16, 224, 224) >>> outputs = model(inputs) >>> for i, output in enumerate(outputs): From 22f6600eb63054045503542e12fa9aa691b88553 Mon Sep 17 00:00:00 2001 From: lilin Date: Wed, 23 Nov 2022 14:52:49 +0800 Subject: [PATCH 4/8] add type hint --- mmaction/datasets/transforms/loading.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index d050c40f4a..f40c193d4a 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -290,11 +290,11 @@ class UniformSampleFrames(BaseTransform): """ def __init__(self, - clip_len, - num_clips=1, - test_mode=False, - seed=255, - out_of_bound_opt='loop'): + clip_len: int, + num_clips: int = 1, + test_mode: bool = False, + seed: int = 255, + out_of_bound_opt: str = 'loop') -> None: self.clip_len = clip_len self.num_clips = num_clips @@ -303,7 +303,7 @@ def __init__(self, self.out_of_bound_opt = out_of_bound_opt assert self.out_of_bound_opt in ['loop', 'repeat_frame'] - def _get_train_clips(self, num_frames): + def _get_train_clips(self, num_frames: int): """Uniformly sample indices for training clips. Args: @@ -333,7 +333,7 @@ def _get_train_clips(self, num_frames): inds = bst + offset return inds - def _get_test_clips(self, num_frames): + def _get_test_clips(self, num_frames: int): """Uniformly sample indices for testing clips. Args: @@ -380,7 +380,7 @@ def _get_test_clips(self, num_frames): inds = np.concatenate(all_inds) return inds - def _get_repeat_sample_clips(self, num_frames): + def _get_repeat_sample_clips(self, num_frames: int) -> np.array: """Repeat sample when video is shorter than clip_len Modified from https://github.com/facebookresearch/SlowFast/blob/64ab cc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. 
@@ -409,7 +409,7 @@ def _get_repeat_sample_clips(self, num_frames): return np.array(inds) - def transform(self, results): + def transform(self, results: dict): num_frames = results['total_frames'] if self.out_of_bound_opt == 'loop': From 08de55812f24a25922baa05e77d626dc29da8eb6 Mon Sep 17 00:00:00 2001 From: lilin Date: Wed, 23 Nov 2022 16:07:14 +0800 Subject: [PATCH 5/8] add type hint --- mmaction/datasets/transforms/loading.py | 17 ++- mmaction/models/backbones/mvit.py | 170 ++++++++++++------------ mmaction/models/heads/mvit_head.py | 3 + 3 files changed, 102 insertions(+), 88 deletions(-) diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index f40c193d4a..6ec61c0590 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -270,23 +270,32 @@ class UniformSampleFrames(BaseTransform): """Uniformly sample frames from the video. To sample an n-frame clip from the video. UniformSampleFrames basically - divide the video into n segments of equal length and randomly sample one + divides the video into n segments of equal length and randomly samples one frame from each segment. To make the testing results reproducible, a random seed is set during testing, to make the sampling results deterministic. - Required keys are "total_frames", "start_index" , added or modified keys - are "frame_inds", "clip_len", "frame_interval" and "num_clips". + Required keys: + + - total_frames + - start_index + + Added keys: + + - frame_inds + - clip_len + - frame_interval + - num_clips Args: clip_len (int): Frames of each sampled output clip. num_clips (int): Number of clips to be sampled. Default: 1. test_mode (bool): Store True when building test or validation dataset. Default: False. + seed (int): The random seed used during test time. Default: 255. out_of_bound_opt (str): The way to deal with out of bounds frame indexes. Available options are 'loop', 'repeat_frame'. Default: 'loop'. - seed (int): The random seed used during test time. Default: 255. """ def __init__(self, diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py index f52aa35688..b3ce6e7427 100644 --- a/mmaction/models/backbones/mvit.py +++ b/mmaction/models/backbones/mvit.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Sequence +from typing import Dict, List, Optional, Sequence, Tuple, Union import numpy as np import torch @@ -17,11 +17,11 @@ from ..utils.embed import PatchEmbed3D -def resize_pos_embed(pos_embed, - src_shape, - dst_shape, - mode='trilinear', - num_extra_tokens=1): +def resize_pos_embed(pos_embed: torch.Tensor, + src_shape: Tuple[int], + dst_shape: Tuple[int], + mode: str = 'trilinear', + num_extra_tokens: int = 1) -> torch.Tensor: """Resize pos_embed weights. Args: @@ -63,7 +63,8 @@ def resize_pos_embed(pos_embed, return torch.cat((extra_tokens, dst_weight), dim=1) -def resize_decomposed_rel_pos(rel_pos, q_size, k_size): +def resize_decomposed_rel_pos(rel_pos: torch.Tensor, q_size: int, + k_size: int) -> torch.Tensor: """Get relative positional embeddings according to the relative positions of query and key sizes. 
@@ -100,14 +101,14 @@ def resize_decomposed_rel_pos(rel_pos, q_size, k_size): return resized[relative_coords.long()] -def add_decomposed_rel_pos(attn, - q, - q_shape, - k_shape, - rel_pos_h, - rel_pos_w, - rel_pos_t, - with_cls_token=False): +def add_decomposed_rel_pos(attn: torch.Tensor, + q: torch.Tensor, + q_shape: Sequence[int], + k_shape: Sequence[int], + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + rel_pos_t: torch.Tensor, + with_cls_token: bool = False) -> torch.Tensor: """Spatiotemporal Relative Positional Embeddings.""" sp_idx = 1 if with_cls_token else 0 B, num_heads, _, C = q.shape @@ -155,11 +156,11 @@ class MLP(BaseModule): """ def __init__(self, - in_channels, - hidden_channels=None, - out_channels=None, - act_cfg=dict(type='GELU'), - init_cfg=None): + in_channels: int, + hidden_channels: Optional[int] = None, + out_channels: Optional[int] = None, + act_cfg: Dict = dict(type='GELU'), + init_cfg: Optional[Dict] = None) -> None: super().__init__(init_cfg=init_cfg) out_channels = out_channels or in_channels hidden_channels = hidden_channels or in_channels @@ -167,7 +168,7 @@ def __init__(self, self.act = build_activation_layer(act_cfg) self.fc2 = nn.Linear(hidden_channels, out_channels) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.fc1(x) x = self.act(x) x = self.fc2(x) @@ -176,9 +177,9 @@ def forward(self, x): def attention_pool(x: torch.Tensor, pool: nn.Module, - in_size: tuple, + in_size: Tuple[int], with_cls_token: bool = False, - norm: Optional[nn.Module] = None): + norm: Optional[nn.Module] = None) -> tuple: """Pooling the feature tokens. Args: @@ -260,20 +261,20 @@ class MultiScaleAttention(BaseModule): """ def __init__(self, - in_dims, - out_dims, - num_heads, - qkv_bias=True, - norm_cfg=dict(type='LN'), - pool_kernel=(3, 3, 3), - stride_q=(1, 1, 1), - stride_kv=(1, 1, 1), - rel_pos_embed=True, - residual_pooling=True, - input_size=None, - rel_pos_zero_init=False, - with_cls_token=True, - init_cfg=None): + in_dims: int, + out_dims: int, + num_heads: int, + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN'), + pool_kernel: Tuple[int] = (3, 3, 3), + stride_q: Tuple[int] = (1, 1, 1), + stride_kv: Tuple[int] = (1, 1, 1), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + input_size: Optional[Tuple[int]] = None, + rel_pos_zero_init: bool = False, + with_cls_token: bool = True, + init_cfg: Optional[dict] = None) -> None: super().__init__(init_cfg=init_cfg) self.num_heads = num_heads self.with_cls_token = with_cls_token @@ -322,7 +323,7 @@ def build_pooling(stride): self.rel_pos_t = nn.Parameter( torch.zeros(2 * input_size[0] - 1, head_dim)) - def init_weights(self): + def init_weights(self) -> None: """Weight initialization.""" super().init_weights() @@ -336,7 +337,7 @@ def init_weights(self): trunc_normal_(self.rel_pos_w, std=0.02) trunc_normal_(self.rel_pos_t, std=0.02) - def forward(self, x, in_size): + def forward(self, x: torch.Tensor, in_size: Tuple[int]) -> tuple: """Forward the MultiScaleAttention.""" B, N, _ = x.shape # (B, H*W, C) @@ -427,25 +428,25 @@ class MultiScaleBlock(BaseModule): def __init__( self, - in_dims, - out_dims, - num_heads, - mlp_ratio=4.0, - qkv_bias=True, - drop_path=0.0, - norm_cfg=dict(type='LN'), - act_cfg=dict(type='GELU'), - qkv_pool_kernel=(3, 3, 3), - stride_q=(1, 1, 1), - stride_kv=(1, 1, 1), - rel_pos_embed=True, - residual_pooling=True, - with_cls_token=True, - dim_mul_in_attention=True, - input_size=None, - rel_pos_zero_init=False, - init_cfg=None, - ): + in_dims: 
int, + out_dims: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + drop_path: float = 0.0, + norm_cfg: Dict = dict(type='LN'), + act_cfg: Dict = dict(type='GELU'), + qkv_pool_kernel: Tuple = (3, 3, 3), + stride_q: Tuple = (1, 1, 1), + stride_kv: Tuple = (1, 1, 1), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + with_cls_token: bool = True, + dim_mul_in_attention: bool = True, + input_size: Optional[Tuple[int]] = None, + rel_pos_zero_init: bool = False, + init_cfg: Optional[Dict] = None, + ) -> None: super().__init__(init_cfg=init_cfg) self.with_cls_token = with_cls_token self.in_dims = in_dims @@ -499,7 +500,7 @@ def __init__( self.pool_skip = None self.init_out_size = input_size - def forward(self, x, in_size): + def forward(self, x: torch.Tensor, in_size: Tuple[int]) -> tuple: x_norm = self.norm1(x) x_attn, out_size = self.attn(x_norm, in_size) @@ -647,33 +648,33 @@ class MViT(BaseModule): num_extra_tokens = 1 def __init__(self, - arch='base', - spatial_size=224, - temporal_size=16, - in_channels=3, - pretrained=None, - out_scales=-1, - drop_path_rate=0., - use_abs_pos_embed=False, - interpolate_mode='trilinear', - pool_kernel=(3, 3, 3), - dim_mul=2, - head_mul=2, - adaptive_kv_stride=(1, 8, 8), - rel_pos_embed=True, - residual_pooling=True, - dim_mul_in_attention=True, - with_cls_token=True, - output_cls_token=True, - rel_pos_zero_init=False, - mlp_ratio=4., - qkv_bias=True, - norm_cfg=dict(type='LN', eps=1e-6), - patch_cfg=dict( + arch: str = 'base', + spatial_size: int = 224, + temporal_size: int = 16, + in_channels: int = 3, + pretrained: Optional[str] = None, + out_scales: Union[int, Sequence[int]] = -1, + drop_path_rate: float = 0., + use_abs_pos_embed: bool = False, + interpolate_mode: str = 'trilinear', + pool_kernel: tuple = (3, 3, 3), + dim_mul: int = 2, + head_mul: int = 2, + adaptive_kv_stride: tuple = (1, 8, 8), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + dim_mul_in_attention: bool = True, + with_cls_token: bool = True, + output_cls_token: bool = True, + rel_pos_zero_init: bool = False, + mlp_ratio: float = 4., + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN', eps=1e-6), + patch_cfg: Dict = dict( kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3)), - init_cfg=None): + init_cfg: Optional[Dict] = None) -> None: super().__init__(init_cfg=init_cfg) self.pretrained = pretrained @@ -819,7 +820,8 @@ def _init_weights(m): if self.use_abs_pos_embed: trunc_normal_(self.pos_embed, std=0.02) - def forward(self, x): + def forward(self, x: torch.Tensor) ->\ + Tuple[Union[torch.Tensor, List[torch.Tensor]]]: """Forward the MViT.""" B = x.shape[0] x, patch_resolution = self.patch_embed(x) diff --git a/mmaction/models/heads/mvit_head.py b/mmaction/models/heads/mvit_head.py index c5df34ea17..3797bb616d 100644 --- a/mmaction/models/heads/mvit_head.py +++ b/mmaction/models/heads/mvit_head.py @@ -13,6 +13,9 @@ class MViTHead(BaseHead): """Classification head for Multi-scale ViT. + A PyTorch implement of : `MViTv2: Improved Multiscale Vision Transformers + for Classification and Detection `_ + Args: num_classes (int): Number of classes to be classified. in_channels (int): Number of channels in input feature. 
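For context on how the pieces above fit together: `MViTHead` consumes the backbone output format exercised by the unit tests, i.e. a tuple with one `[patch_token, cls_token]` pair per output scale, and classifies from the cls token of the last scale. A minimal sketch, not part of the patch itself, assuming an mmaction2 environment with this patch applied; the tensor shapes mirror `test_mvit_head.py`:

```python
import torch

from mmaction.models import MViTHead

# Head matching the 768-channel features asserted in test_mvit.py.
head = MViTHead(in_channels=768, num_classes=5)
head.init_weights()

# Fake backbone output: one [patch_token, cls_token] pair per out_scale,
# with NCTHW patch tokens and a (B, C) cls token.
feats = ([torch.rand(4, 768, 3, 2, 2), torch.rand(4, 768)], )

assert head.pre_logits(feats) is feats[-1][1]  # cls token of the last scale
cls_score = head(feats)
print(cls_score.shape)  # torch.Size([4, 5])
```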
From c7dea18436671e04feeb6627035e82a318a38ad9 Mon Sep 17 00:00:00 2001 From: lilin Date: Wed, 23 Nov 2022 16:48:11 +0800 Subject: [PATCH 6/8] fix init_cfg for mvit --- mmaction/models/backbones/mvit.py | 101 ++++++++++++++---------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py index b3ce6e7427..95f917f136 100644 --- a/mmaction/models/backbones/mvit.py +++ b/mmaction/models/backbones/mvit.py @@ -7,10 +7,8 @@ import torch.nn.functional as F from mmcv.cnn import build_activation_layer, build_norm_layer from mmcv.cnn.bricks import DropPath -from mmengine.logging import MMLogger from mmengine.model import BaseModule, ModuleList -from mmengine.model.weight_init import constant_init, trunc_normal_ -from mmengine.runner import load_checkpoint +from mmengine.model.weight_init import trunc_normal_ from mmengine.utils import to_3tuple from mmaction.registry import MODELS @@ -160,7 +158,7 @@ def __init__(self, hidden_channels: Optional[int] = None, out_channels: Optional[int] = None, act_cfg: Dict = dict(type='GELU'), - init_cfg: Optional[Dict] = None) -> None: + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: super().__init__(init_cfg=init_cfg) out_channels = out_channels or in_channels hidden_channels = hidden_channels or in_channels @@ -598,8 +596,12 @@ class MViT(BaseModule): ``dict(kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))``. - init_cfg (dict, optional): The Config for initialization. - Defaults to None. + init_cfg (dict, optional): The Config for initialization. Defaults to + ``[ + dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02), + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.02), + ]`` Examples: >>> import torch @@ -647,37 +649,42 @@ class MViT(BaseModule): } num_extra_tokens = 1 - def __init__(self, - arch: str = 'base', - spatial_size: int = 224, - temporal_size: int = 16, - in_channels: int = 3, - pretrained: Optional[str] = None, - out_scales: Union[int, Sequence[int]] = -1, - drop_path_rate: float = 0., - use_abs_pos_embed: bool = False, - interpolate_mode: str = 'trilinear', - pool_kernel: tuple = (3, 3, 3), - dim_mul: int = 2, - head_mul: int = 2, - adaptive_kv_stride: tuple = (1, 8, 8), - rel_pos_embed: bool = True, - residual_pooling: bool = True, - dim_mul_in_attention: bool = True, - with_cls_token: bool = True, - output_cls_token: bool = True, - rel_pos_zero_init: bool = False, - mlp_ratio: float = 4., - qkv_bias: bool = True, - norm_cfg: Dict = dict(type='LN', eps=1e-6), - patch_cfg: Dict = dict( - kernel_size=(3, 7, 7), - stride=(2, 4, 4), - padding=(1, 3, 3)), - init_cfg: Optional[Dict] = None) -> None: + def __init__( + self, + arch: str = 'base', + spatial_size: int = 224, + temporal_size: int = 16, + in_channels: int = 3, + pretrained: Optional[str] = None, + out_scales: Union[int, Sequence[int]] = -1, + drop_path_rate: float = 0., + use_abs_pos_embed: bool = False, + interpolate_mode: str = 'trilinear', + pool_kernel: tuple = (3, 3, 3), + dim_mul: int = 2, + head_mul: int = 2, + adaptive_kv_stride: tuple = (1, 8, 8), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + dim_mul_in_attention: bool = True, + with_cls_token: bool = True, + output_cls_token: bool = True, + rel_pos_zero_init: bool = False, + mlp_ratio: float = 4., + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN', eps=1e-6), + patch_cfg: Dict = dict( + kernel_size=(3, 7, 7), 
stride=(2, 4, 4), padding=(1, 3, 3)), + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02), + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.02), + ] + ) -> None: + if pretrained: + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) super().__init__(init_cfg=init_cfg) - self.pretrained = pretrained if isinstance(arch, str): arch = arch.lower() assert arch in set(self.arch_zoo), \ @@ -796,26 +803,12 @@ def __init__(self, self.add_module(f'norm{stage_index}', norm_layer) def init_weights(self, pretrained: Optional[str] = None) -> None: + super().init_weights() - def _init_weights(m): - if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv3d)): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - constant_init(m.bias, 0.02) - elif isinstance(m, nn.LayerNorm): - constant_init(m.bias, 0.02) - constant_init(m.weight, 1.0) - - if pretrained: - self.pretrained = pretrained - if isinstance(self.pretrained, str): - logger = MMLogger.get_current_instance() - logger.info(f'load model from: {self.pretrained}') - load_checkpoint(self, self.pretrained, strict=False, logger=logger) - elif self.pretrained is None: - self.apply(_init_weights) - else: - raise TypeError('pretrained must be a str or None') + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress default init if use pretrained model. + return if self.use_abs_pos_embed: trunc_normal_(self.pos_embed, std=0.02) From 8f123b4d3124436e3e862d19d4a3796f480f6963 Mon Sep 17 00:00:00 2001 From: lilin Date: Wed, 23 Nov 2022 20:00:54 +0800 Subject: [PATCH 7/8] fix ut --- mmaction/models/recognizers/recognizer3d.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mmaction/models/recognizers/recognizer3d.py b/mmaction/models/recognizers/recognizer3d.py index bb7e250157..81b86534ac 100644 --- a/mmaction/models/recognizers/recognizer3d.py +++ b/mmaction/models/recognizers/recognizer3d.py @@ -83,7 +83,10 @@ def recursively_cat(feats): return tuple(out_feats) - x = recursively_cat(feats) + if isinstance(feats[0], tuple): + x = recursively_cat(feats) + else: + x = torch.cat(feats) else: x = self.backbone(inputs) if self.with_neck: From 6f78baceaab0d0b3eb45a650987a92b575b2e461 Mon Sep 17 00:00:00 2001 From: lilin Date: Tue, 29 Nov 2022 15:22:49 +0800 Subject: [PATCH 8/8] split uniform sample --- .../mvit/mvit-base-p244_u32_sthv2-rgb.py | 17 +-- .../mvit/mvit-large-p244_u40_sthv2-rgb.py | 17 +-- .../mvit/mvit-small-p244_u16_sthv2-rgb.py | 17 +-- mmaction/datasets/transforms/__init__.py | 14 +- mmaction/datasets/transforms/loading.py | 130 ++--------------- mmaction/datasets/transforms/pose_loading.py | 135 ++++++++++++++++++ .../datasets/transforms/test_pose_loading.py | 88 +++++++++++- tests/datasets/transforms/test_sampling.py | 87 +---------- 8 files changed, 254 insertions(+), 251 deletions(-) diff --git a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py index 944e17440d..c954b60b54 100644 --- a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py @@ -32,10 +32,7 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=32, - out_of_bound_opt='repeat_frame'), + dict(type='UniformSample', 
clip_len=32), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -51,11 +48,7 @@ ] val_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=32, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=32, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), @@ -64,11 +57,7 @@ ] test_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=32, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=32, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 224)), dict(type='ThreeCrop', crop_size=224), diff --git a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py index 9b47b27a10..b3fde41a78 100644 --- a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py @@ -34,10 +34,7 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=40, - out_of_bound_opt='repeat_frame'), + dict(type='UniformSample', clip_len=40), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -53,11 +50,7 @@ ] val_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=40, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=40, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), @@ -66,11 +59,7 @@ ] test_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=40, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=40, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 224)), dict(type='ThreeCrop', crop_size=224), diff --git a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py index 23f404db53..08934b9a5e 100644 --- a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py @@ -15,10 +15,7 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=16, - out_of_bound_opt='repeat_frame'), + dict(type='UniformSample', clip_len=16), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -34,11 +31,7 @@ ] val_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=16, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=16, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), @@ -47,11 +40,7 @@ ] test_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=16, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=16, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 224)), dict(type='ThreeCrop', crop_size=224), diff --git 
a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py index 7aaaee894d..09e0111e4c 100644 --- a/mmaction/datasets/transforms/__init__.py +++ b/mmaction/datasets/transforms/__init__.py @@ -10,9 +10,9 @@ LoadProposals, OpenCVDecode, OpenCVInit, PIMSDecode, PIMSInit, PyAVDecode, PyAVDecodeMotionVector, PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, - UniformSampleFrames, UntrimmedSampleFrames) + UniformSample, UntrimmedSampleFrames) from .pose_loading import (GeneratePoseTarget, LoadKineticsPose, - PaddingWithLoop, PoseDecode) + PaddingWithLoop, PoseDecode, UniformSampleFrames) from .processing import (AudioAmplify, CenterCrop, ColorJitter, Flip, Fuse, MelSpectrogram, MultiScaleCrop, PoseCompact, RandomCrop, RandomRescale, RandomResizedCrop, Resize, @@ -30,9 +30,9 @@ 'AudioAmplify', 'MelSpectrogram', 'AudioDecode', 'FormatAudioShape', 'LoadAudioFeature', 'AudioFeatureSelector', 'AudioDecodeInit', 'ImageDecode', 'BuildPseudoClip', 'RandomRescale', 'PIMSDecode', - 'PyAVDecodeMotionVector', 'UniformSampleFrames', 'PoseDecode', - 'LoadKineticsPose', 'GeneratePoseTarget', 'PIMSInit', 'FormatGCNInput', - 'PaddingWithLoop', 'ArrayDecode', 'JointToBone', 'PackActionInputs', - 'PackLocalizationInputs', 'ImgAug', 'TorchVisionWrapper', - 'PytorchVideoWrapper', 'PoseCompact' + 'PyAVDecodeMotionVector', 'UniformSample', 'UniformSampleFrames', + 'PoseDecode', 'LoadKineticsPose', 'GeneratePoseTarget', 'PIMSInit', + 'FormatGCNInput', 'PaddingWithLoop', 'ArrayDecode', 'JointToBone', + 'PackActionInputs', 'PackLocalizationInputs', 'ImgAug', + 'TorchVisionWrapper', 'PytorchVideoWrapper', 'PoseCompact' ] diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index 6ec61c0590..e0b5ce75a6 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -266,14 +266,15 @@ def __repr__(self): @TRANSFORMS.register_module() -class UniformSampleFrames(BaseTransform): - """Uniformly sample frames from the video. +class UniformSample(BaseTransform): + """Uniformly sample frames from the video. Currently used for Something- + Something V2 dataset. Modified from + https://github.com/facebookresearch/SlowFast/blob/64a + bcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. To sample an n-frame clip from the video. UniformSampleFrames basically divides the video into n segments of equal length and randomly samples one - frame from each segment. To make the testing results reproducible, a - random seed is set during testing, to make the sampling results - deterministic. + frame from each segment. Required keys: @@ -292,113 +293,23 @@ class UniformSampleFrames(BaseTransform): num_clips (int): Number of clips to be sampled. Default: 1. test_mode (bool): Store True when building test or validation dataset. Default: False. - seed (int): The random seed used during test time. Default: 255. - out_of_bound_opt (str): The way to deal with out of bounds frame - indexes. Available options are 'loop', 'repeat_frame'. - Default: 'loop'. 
""" def __init__(self, clip_len: int, num_clips: int = 1, - test_mode: bool = False, - seed: int = 255, - out_of_bound_opt: str = 'loop') -> None: + test_mode: bool = False) -> None: self.clip_len = clip_len self.num_clips = num_clips self.test_mode = test_mode - self.seed = seed - self.out_of_bound_opt = out_of_bound_opt - assert self.out_of_bound_opt in ['loop', 'repeat_frame'] - - def _get_train_clips(self, num_frames: int): - """Uniformly sample indices for training clips. - - Args: - num_frames (int): The number of frames. - """ - - assert self.num_clips == 1 - if num_frames < self.clip_len: - start = np.random.randint(0, num_frames) - inds = np.arange(start, start + self.clip_len) - elif self.clip_len <= num_frames < 2 * self.clip_len: - basic = np.arange(self.clip_len) - inds = np.random.choice( - self.clip_len + 1, num_frames - self.clip_len, replace=False) - offset = np.zeros(self.clip_len + 1, dtype=np.int32) - offset[inds] = 1 - offset = np.cumsum(offset) - inds = basic + offset[:-1] - else: - bids = np.array([ - i * num_frames // self.clip_len - for i in range(self.clip_len + 1) - ]) - bsize = np.diff(bids) - bst = bids[:self.clip_len] - offset = np.random.randint(bsize) - inds = bst + offset - return inds - - def _get_test_clips(self, num_frames: int): - """Uniformly sample indices for testing clips. - Args: - num_frames (int): The number of frames. - """ - - np.random.seed(self.seed) - if num_frames < self.clip_len: - # Then we use a simple strategy - if num_frames < self.num_clips: - start_inds = list(range(self.num_clips)) - else: - start_inds = [ - i * num_frames // self.num_clips - for i in range(self.num_clips) - ] - inds = np.concatenate( - [np.arange(i, i + self.clip_len) for i in start_inds]) - elif self.clip_len <= num_frames < self.clip_len * 2: - all_inds = [] - for i in range(self.num_clips): - basic = np.arange(self.clip_len) - inds = np.random.choice( - self.clip_len + 1, - num_frames - self.clip_len, - replace=False) - offset = np.zeros(self.clip_len + 1, dtype=np.int32) - offset[inds] = 1 - offset = np.cumsum(offset) - inds = basic + offset[:-1] - all_inds.append(inds) - inds = np.concatenate(all_inds) - else: - bids = np.array([ - i * num_frames // self.clip_len - for i in range(self.clip_len + 1) - ]) - bsize = np.diff(bids) - bst = bids[:self.clip_len] - all_inds = [] - for i in range(self.num_clips): - offset = np.random.randint(bsize) - all_inds.append(bst + offset) - inds = np.concatenate(all_inds) - return inds - - def _get_repeat_sample_clips(self, num_frames: int) -> np.array: - """Repeat sample when video is shorter than clip_len Modified from - https://github.com/facebookresearch/SlowFast/blob/64ab - cc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. - - When video frames is shorter than target clip len, this strategy would - repeat sample frame, rather than loop sample in 'loop' mode. - In test mode, this strategy would sample the middle frame of each - segment, rather than set a random seed, and therefore only support - sample 1 clip. + def _get_sample_clips(self, num_frames: int) -> np.array: + """When video frames is shorter than target clip len, this strategy + would repeat sample frame, rather than loop sample in 'loop' mode. In + test mode, this strategy would sample the middle frame of each segment, + rather than set a random seed, and therefore only support sample 1 + clip. Args: num_frames (int): Total number of frame in the video. 
@@ -421,17 +332,7 @@ def _get_repeat_sample_clips(self, num_frames: int) -> np.array: def transform(self, results: dict): num_frames = results['total_frames'] - if self.out_of_bound_opt == 'loop': - if self.test_mode: - inds = self._get_test_clips(num_frames) - else: - inds = self._get_train_clips(num_frames) - inds = np.mod(inds, num_frames) - elif self.out_of_bound_opt == 'repeat_frame': - inds = self._get_repeat_sample_clips(num_frames) - else: - raise ValueError('Illegal out_of_bound option.') - + inds = self._get_sample_clips(num_frames) start_index = results['start_index'] inds = inds + start_index @@ -445,8 +346,7 @@ def __repr__(self): repr_str = (f'{self.__class__.__name__}(' f'clip_len={self.clip_len}, ' f'num_clips={self.num_clips}, ' - f'test_mode={self.test_mode}, ' - f'seed={self.seed})') + f'test_mode={self.test_mode}') return repr_str diff --git a/mmaction/datasets/transforms/pose_loading.py b/mmaction/datasets/transforms/pose_loading.py index 592850334f..58748eacb6 100644 --- a/mmaction/datasets/transforms/pose_loading.py +++ b/mmaction/datasets/transforms/pose_loading.py @@ -11,6 +11,141 @@ from .processing import Flip +@TRANSFORMS.register_module() +class UniformSampleFrames(BaseTransform): + """Uniformly sample frames from the video. + + To sample an n-frame clip from the video. UniformSampleFrames basically + divide the video into n segments of equal length and randomly sample one + frame from each segment. To make the testing results reproducible, a + random seed is set during testing, to make the sampling results + deterministic. + + Required keys are ``'total_frames'``, ``'start_index'`` , added or + modified keys are ``'frame_inds'``, ``'clip_len'``, + ``'frame_interval'`` and ``'num_clips'``. + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Defaults to 1. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + seed (int): The random seed used during test time. Defaults to 255. + """ + + def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255): + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + self.seed = seed + + def _get_train_clips(self, num_frames, clip_len): + """Uniformly sample indices for training clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + """ + + assert self.num_clips == 1 + if num_frames < clip_len: + start = np.random.randint(0, num_frames) + inds = np.arange(start, start + clip_len) + elif clip_len <= num_frames < 2 * clip_len: + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + return inds + + def _get_test_clips(self, num_frames, clip_len): + """Uniformly sample indices for testing clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. 
+ """ + + np.random.seed(self.seed) + if num_frames < clip_len: + # Then we use a simple strategy + if num_frames < self.num_clips: + start_inds = list(range(self.num_clips)) + else: + start_inds = [ + i * num_frames // self.num_clips + for i in range(self.num_clips) + ] + inds = np.concatenate( + [np.arange(i, i + clip_len) for i in start_inds]) + elif clip_len <= num_frames < clip_len * 2: + all_inds = [] + for i in range(self.num_clips): + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + all_inds.append(inds) + inds = np.concatenate(all_inds) + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + all_inds = [] + for i in range(self.num_clips): + offset = np.random.randint(bsize) + all_inds.append(bst + offset) + inds = np.concatenate(all_inds) + return inds + + def transform(self, results): + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + num_frames = results['total_frames'] + + if self.test_mode: + inds = self._get_test_clips(num_frames, self.clip_len) + else: + inds = self._get_train_clips(num_frames, self.clip_len) + + inds = np.mod(inds, num_frames) + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}, ' + f'seed={self.seed})') + return repr_str + + @TRANSFORMS.register_module() class PoseDecode(BaseTransform): """Load and decode pose with given indices. 
diff --git a/tests/datasets/transforms/test_pose_loading.py b/tests/datasets/transforms/test_pose_loading.py
index fd7568798f..eeb2dad84c 100644
--- a/tests/datasets/transforms/test_pose_loading.py
+++ b/tests/datasets/transforms/test_pose_loading.py
@@ -10,11 +10,97 @@
 from numpy.testing import assert_array_almost_equal, assert_array_equal
 
 from mmaction.datasets.transforms import (GeneratePoseTarget, LoadKineticsPose,
-                                          PaddingWithLoop, PoseDecode)
+                                          PaddingWithLoop, PoseDecode,
+                                          UniformSampleFrames)
 
 
 class TestPoseLoading:
 
+    @staticmethod
+    def test_uniform_sample_frames():
+        results = dict(total_frames=64, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=True, seed=0)
+
+        assert str(sampling) == ('UniformSampleFrames(clip_len=8, '
+                                 'num_clips=1, test_mode=True, seed=0)')
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert_array_equal(sampling_results['frame_inds'],
+                           np.array([4, 15, 21, 24, 35, 43, 51, 63]))
+
+        results = dict(total_frames=15, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=True, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert_array_equal(sampling_results['frame_inds'],
+                           np.array([0, 2, 4, 6, 8, 9, 11, 13]))
+
+        results = dict(total_frames=7, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=True, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert_array_equal(sampling_results['frame_inds'],
+                           np.array([0, 1, 2, 3, 4, 5, 6, 0]))
+
+        results = dict(total_frames=7, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=8, test_mode=True, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 8
+        assert len(sampling_results['frame_inds']) == 64
+
+        results = dict(total_frames=64, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=4, test_mode=True, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 4
+        assert_array_equal(
+            sampling_results['frame_inds'],
+            np.array([
+                4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56,
+                0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60
+            ]))
+
+        results = dict(total_frames=64, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=False, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert len(sampling_results['frame_inds']) == 8
+
+        results = dict(total_frames=7, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=False, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert len(sampling_results['frame_inds']) == 8
+
+        results = dict(total_frames=15, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=False, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert len(sampling_results['frame_inds']) == 8
+
     @staticmethod
     def test_pose_decode():
         kp = np.random.random([1, 16, 17, 2])
diff --git a/tests/datasets/transforms/test_sampling.py b/tests/datasets/transforms/test_sampling.py
index 9450682315..f4a5e457bd 100644
--- a/tests/datasets/transforms/test_sampling.py
+++ b/tests/datasets/transforms/test_sampling.py
@@ -9,8 +9,7 @@
 
 from mmaction.datasets.transforms import (AudioFeatureSelector,
                                           DenseSampleFrames, SampleAVAFrames,
-                                          SampleFrames, UniformSampleFrames,
-                                          UntrimmedSampleFrames)
+                                          SampleFrames, UntrimmedSampleFrames)
 
 
 class BaseTestLoading:
@@ -402,90 +401,6 @@ def check_monotonous(arr):
         assert np.max(sample_frames_results['frame_inds']) <= 40
         assert np.min(sample_frames_results['frame_inds']) >= 1
 
-    def test_uniform_sample_frames(self):
-        results = dict(total_frames=64, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=True, seed=0)
-
-        assert str(sampling) == ('UniformSampleFrames(clip_len=8, '
-                                 'num_clips=1, test_mode=True, seed=0)')
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert_array_equal(sampling_results['frame_inds'],
-                           np.array([4, 15, 21, 24, 35, 43, 51, 63]))
-
-        results = dict(total_frames=15, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=True, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert_array_equal(sampling_results['frame_inds'],
-                           np.array([0, 2, 4, 6, 8, 9, 11, 13]))
-
-        results = dict(total_frames=7, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=True, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert_array_equal(sampling_results['frame_inds'],
-                           np.array([0, 1, 2, 3, 4, 5, 6, 0]))
-
-        results = dict(total_frames=7, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=8, test_mode=True, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 8
-        assert len(sampling_results['frame_inds']) == 64
-
-        results = dict(total_frames=64, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=4, test_mode=True, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 4
-        assert_array_equal(
-            sampling_results['frame_inds'],
-            np.array([
-                4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56,
-                0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60
-            ]))
-
-        results = dict(total_frames=64, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=False, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert len(sampling_results['frame_inds']) == 8
-
-        results = dict(total_frames=7, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=False, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert len(sampling_results['frame_inds']) == 8
-
-        results = dict(total_frames=15, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=False, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert len(sampling_results['frame_inds']) == 8
-
     def test_dense_sample_frames(self):
         target_keys = [
             'frame_inds', 'clip_len', 'frame_interval', 'num_clips',