From b8f6341fbb6507b029c29a3b6c3926b62e0671b3 Mon Sep 17 00:00:00 2001 From: lilin Date: Tue, 11 Oct 2022 16:35:51 +0800 Subject: [PATCH 1/8] [feat] support MViT --- configs/_base_/models/mvit_small.py | 20 + configs/recognition/mvit/README.md | 77 ++ .../mvit-base-p244_32x3x1_kinetics400-rgb.py | 138 +++ .../mvit/mvit-base-p244_u32_sthv2-rgb.py | 141 +++ .../mvit-large-p244_40x3x1_kinetics400-rgb.py | 141 +++ .../mvit/mvit-large-p244_u40_sthv2-rgb.py | 143 +++ .../mvit-small-p244_16x4x1_kinetics400-rgb.py | 132 +++ .../mvit/mvit-small-p244_u16_sthv2-rgb.py | 135 +++ mmaction/datasets/transforms/__init__.py | 4 +- mmaction/datasets/transforms/loading.py | 250 ++++++ mmaction/datasets/transforms/pose_loading.py | 135 --- mmaction/datasets/transforms/processing.py | 170 ++++ mmaction/models/backbones/__init__.py | 4 +- mmaction/models/backbones/mvit.py | 850 ++++++++++++++++++ mmaction/models/heads/__init__.py | 4 +- mmaction/models/heads/mvit_head.py | 71 ++ mmaction/models/recognizers/recognizer3d.py | 25 +- mmaction/models/utils/__init__.py | 7 +- mmaction/models/utils/blending_utils.py | 68 ++ mmaction/models/utils/embed.py | 234 +++++ .../datasets/transforms/test_pose_loading.py | 88 +- tests/datasets/transforms/test_sampling.py | 87 +- tests/models/backbones/test_mvit.py | 134 +++ tests/models/utils/test_blending_utils.py | 42 +- 24 files changed, 2860 insertions(+), 240 deletions(-) create mode 100644 configs/_base_/models/mvit_small.py create mode 100644 configs/recognition/mvit/README.md create mode 100644 configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py create mode 100644 configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py create mode 100644 configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py create mode 100644 configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py create mode 100644 configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py create mode 100644 configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py create mode 100644 mmaction/models/backbones/mvit.py create mode 100644 mmaction/models/heads/mvit_head.py create mode 100644 mmaction/models/utils/embed.py create mode 100644 tests/models/backbones/test_mvit.py diff --git a/configs/_base_/models/mvit_small.py b/configs/_base_/models/mvit_small.py new file mode 100644 index 0000000000..727df37c38 --- /dev/null +++ b/configs/_base_/models/mvit_small.py @@ -0,0 +1,20 @@ +model = dict( + type='Recognizer3D', + backbone=dict(type='MViT', arch='small', drop_path_rate=0.2), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), + cls_head=dict( + type='MVitHead', + in_channels=768, + num_classes=400, + label_smooth_eps=0.1, + average_clips='prob')) diff --git a/configs/recognition/mvit/README.md b/configs/recognition/mvit/README.md new file mode 100644 index 0000000000..fdc694a128 --- /dev/null +++ b/configs/recognition/mvit/README.md @@ -0,0 +1,77 @@ +# MViT V2 + +> [MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf) + + + +## Abstract + + + +In this paper, we study Multiscale Vision Transformers (MViTv2) as a unified 
architecture for image and video +classification, as well as object detection. We present an improved version of MViT that incorporates +decomposed relative positional embeddings and residual pooling connections. We instantiate this architecture +in five sizes and evaluate it for ImageNet classification, COCO detection and Kinetics video recognition where +it outperforms prior work. We further compare MViTv2s' pooling attention to window attention mechanisms where +it outperforms the latter in accuracy/compute. Without bells-and-whistles, MViTv2 has state-of-the-art +performance in 3 domains: 88.8% accuracy on ImageNet classification, 58.7 boxAP on COCO object detection as +well as 86.1% on Kinetics-400 video classification. + + + +
+ +
+ +## Results and models + +### Kinetics-400 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | params | config | ckpt | +| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :----: | :------------------: | :-----------------: | +| 16x4x1 | short-side 320 | MViTv2-S\* | From scratch | 81.1 | 94.7 | [81.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | xx.xM | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| 32x3x1 | short-side 320 | MViTv2-B\* | From scratch | 82.6 | 95.8 | [82.9](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [95.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | xx.xM | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| 40x3x1 | short-side 320 | MViTv2-L\* | From scratch | 85.4 | 96.2 | [86.1](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [97.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | + +### Something-Something V2 + +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | params | config | ckpt | +| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :----: | :------------------: | :-----------------: | +| uniform 16 | short-side 320 | MViTv2-S\* | K400 | 68.1 | 91.0 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| uniform 32 | short-side 320 | MViTv2-B\* | K400 | 70.8 | 92.7 | [70.5](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [92.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| uniform 40 | short-side 320 | MViTv2-L\* | IN21K + K400 | 73.2 | 94.0 | [73.3](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | + +*Models with * are ported from the repo 
[SlowFast](https://github.com/facebookresearch/SlowFast/) and tested on our data. Currently, we only support the testing of X3D models, training will be available soon.* + +1. The values in columns named after "reference" are copied from paper +2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. + +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test MViT model on Kinetics-400 dataset and dump the result to a pkl file. + +```shell +python tools/test.py configs/recognition/mvit/mvit-small_16x4x1_kinetics400-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). + +## Citation + +```bibtex +@inproceedings{li2021improved, + title={MViTv2: Improved multiscale vision transformers for classification and detection}, + author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph}, + booktitle={CVPR}, + year={2022} +} +``` diff --git a/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py new file mode 100644 index 0000000000..93b33a9dc9 --- /dev/null +++ b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py @@ -0,0 +1,138 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='base', + temporal_size=32, + drop_path_rate=0.3, + )) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=32, frame_interval=3, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=3, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', 
input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=32, + frame_interval=3, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=1.6e-3, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=0, + by_epoch=True, + begin=0, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
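+# (Illustrative note: when auto scaling is enabled, MMEngine multiplies the
+# learning rate by actual_total_batch_size / base_batch_size, so e.g. training
+# with 16 GPUs x 8 samples = 128 would roughly double the LR tuned for 64.)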
+auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py new file mode 100644 index 0000000000..c719396f29 --- /dev/null +++ b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py @@ -0,0 +1,141 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='base', + temporal_size=32, + drop_path_rate=0.3, + ), + cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=32, + out_of_bound_opt='repeat_frame'), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=32, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=32, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, 
+ convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=70, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py new file mode 100644 index 0000000000..883d9f7ce5 --- /dev/null +++ b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py @@ -0,0 +1,141 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='large', + temporal_size=40, + spatial_size=312, + drop_path_rate=0.75, + ), + cls_head=dict(in_channels=1152), + test_cfg=dict(max_testing_views=5)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=40, frame_interval=3, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 356)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(312, 312), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=40, + frame_interval=3, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 356)), + dict(type='CenterCrop', crop_size=312), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=40, + frame_interval=3, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 312)), + dict(type='ThreeCrop', crop_size=312), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + 
data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=30, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=1.6e-3, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=0, + by_epoch=True, + begin=0, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py new file mode 100644 index 0000000000..c682571df6 --- /dev/null +++ b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py @@ -0,0 +1,143 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + arch='large', + temporal_size=40, + spatial_size=312, + drop_path_rate=0.75, + ), + cls_head=dict(in_channels=1152, num_classes=174), + test_cfg=dict(max_testing_views=5)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=40, + out_of_bound_opt='repeat_frame'), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=40, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=40, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + 
pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=70, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=10)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py new file mode 100644 index 0000000000..0df0b835fa --- /dev/null +++ b/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py @@ -0,0 +1,132 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=16, + frame_interval=4, + num_clips=5, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', 
scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=200, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=200, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=200, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py new file mode 100644 index 0000000000..7327df2e11 --- /dev/null +++ b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py @@ -0,0 +1,135 @@ +_base_ = [ + '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' +] + +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/sthv2/videos' +data_root_val = 'data/sthv2/videos' +ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' +ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' +ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=16, + out_of_bound_opt='repeat_frame'), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='Flip', flip_ratio=0.5), + dict(type='RandomErasing', erase_prob=0.25, mode='rand'), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=16, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSampleFrames', + clip_len=16, + out_of_bound_opt='repeat_frame', + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1.6e-3 +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=30, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + 
T_max=100, + eta_min=base_lr / 100, + by_epoch=True, + begin=30, + end=100, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py index f21e6d01b9..7aaaee894d 100644 --- a/mmaction/datasets/transforms/__init__.py +++ b/mmaction/datasets/transforms/__init__.py @@ -10,9 +10,9 @@ LoadProposals, OpenCVDecode, OpenCVInit, PIMSDecode, PIMSInit, PyAVDecode, PyAVDecodeMotionVector, PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, - UntrimmedSampleFrames) + UniformSampleFrames, UntrimmedSampleFrames) from .pose_loading import (GeneratePoseTarget, LoadKineticsPose, - PaddingWithLoop, PoseDecode, UniformSampleFrames) + PaddingWithLoop, PoseDecode) from .processing import (AudioAmplify, CenterCrop, ColorJitter, Flip, Fuse, MelSpectrogram, MultiScaleCrop, PoseCompact, RandomCrop, RandomRescale, RandomResizedCrop, Resize, diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index ceb761d638..e756410dac 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -265,6 +265,256 @@ def __repr__(self): return repr_str +@TRANSFORMS.register_module() +class SampleFramesV2(SampleFrames): + """Sample frames from the video. + + Required keys are "total_frames", "start_index" , added or modified keys + are "frame_inds", "frame_interval" and "num_clips". + Args: + clip_len (int): Frames of each sampled output clip. + frame_interval (int): Temporal interval of adjacent sampled frames. + Default: 1. + num_clips (int): Number of clips to be sampled. Default: 1. + temporal_jitter (bool): Whether to apply temporal jittering. + Default: False. + out_of_bound_opt (str): The way to deal with out of bounds frame + indexes. Available options are 'loop', 'repeat_last'. + Default: 'loop'. + test_mode (bool): Store True when building test or validation dataset. + Default: False. + start_index (None): This argument is deprecated and moved to dataset + class (``BaseDataset``, ``VideoDatset``, ``RawframeDataset``, etc), + see this: https://github.com/open-mmlab/mmaction2/pull/89. + keep_tail_frames (bool): Whether to keep tail frames when sampling. + Default: False. + """ + + def __init__(self, + clip_len, + frame_interval=1, + num_clips=1, + temporal_jitter=False, + out_of_bound_opt='loop', + test_mode=False, + keep_tail_frames=False): + super().__init__(clip_len, frame_interval, num_clips, temporal_jitter, + False, out_of_bound_opt, test_mode, keep_tail_frames) + + def _get_train_clips(self, num_frames): + """Get clip offsets in train mode. + + Args: + num_frames (int): Total number of frame in the video. + Returns: + np.ndarray: Sampled frame indices in train mode. 
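+            Example (illustrative): with ``clip_len=32``, ``frame_interval=3``
+            and ``num_clips=1``, a 300-frame video gives
+            ``ori_clip_len = 31 * 3 + 1 = 94``, so the single clip offset is
+            drawn uniformly from ``[0, 300 - 94)`` and rounded to an integer.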
+ """ + ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 + max_offset = max(num_frames - ori_clip_len, 0) + + num_segments = max(self.num_clips - 1, 1) + offset_between = max_offset / num_segments + clip_offsets = np.arange(self.num_clips) * offset_between + clip_offsets += np.random.uniform(0, offset_between, self.num_clips) + clip_offsets = np.round(clip_offsets).astype(np.int32) + return clip_offsets + + def _get_test_clips(self, num_frames): + """Get clip offsets in test mode. + + If the total number of frames is + not enough, it will return all zero indices. + Args: + num_frames (int): Total number of frame in the video. + Returns: + np.ndarray: Sampled frame indices in test mode. + """ + ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 + max_offset = max(num_frames - ori_clip_len, 0) + + num_segments = max(self.num_clips - 1, 1) + offset_between = max_offset / float(num_segments) + clip_offsets = np.arange(self.num_clips) * offset_between + clip_offsets = np.round(clip_offsets).astype(np.int32) + return clip_offsets + + +@TRANSFORMS.register_module() +class UniformSampleFrames(BaseTransform): + """Uniformly sample frames from the video. + + To sample an n-frame clip from the video. UniformSampleFrames basically + divide the video into n segments of equal length and randomly sample one + frame from each segment. To make the testing results reproducible, a + random seed is set during testing, to make the sampling results + deterministic. + + Required keys are "total_frames", "start_index" , added or modified keys + are "frame_inds", "clip_len", "frame_interval" and "num_clips". + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Default: 1. + test_mode (bool): Store True when building test or validation dataset. + Default: False. + out_of_bound_opt (str): The way to deal with out of bounds frame + indexes. Available options are 'loop', 'repeat_frame'. + Default: 'loop'. + seed (int): The random seed used during test time. Default: 255. + """ + + def __init__(self, + clip_len, + num_clips=1, + test_mode=False, + seed=255, + out_of_bound_opt='loop'): + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + self.seed = seed + self.out_of_bound_opt = out_of_bound_opt + assert self.out_of_bound_opt in ['loop', 'repeat_frame'] + + def _get_train_clips(self, num_frames): + """Uniformly sample indices for training clips. + + Args: + num_frames (int): The number of frames. + """ + + assert self.num_clips == 1 + if num_frames < self.clip_len: + start = np.random.randint(0, num_frames) + inds = np.arange(start, start + self.clip_len) + elif self.clip_len <= num_frames < 2 * self.clip_len: + basic = np.arange(self.clip_len) + inds = np.random.choice( + self.clip_len + 1, num_frames - self.clip_len, replace=False) + offset = np.zeros(self.clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array([ + i * num_frames // self.clip_len + for i in range(self.clip_len + 1) + ]) + bsize = np.diff(bids) + bst = bids[:self.clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + return inds + + def _get_test_clips(self, num_frames): + """Uniformly sample indices for testing clips. + + Args: + num_frames (int): The number of frames. 
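+            Note (illustrative): the random seed is (re)set here on every
+            call, so the sampled test indices depend only on ``num_frames``,
+            ``clip_len`` and ``num_clips``, keeping testing deterministic.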
+ """ + + np.random.seed(self.seed) + if num_frames < self.clip_len: + # Then we use a simple strategy + if num_frames < self.num_clips: + start_inds = list(range(self.num_clips)) + else: + start_inds = [ + i * num_frames // self.num_clips + for i in range(self.num_clips) + ] + inds = np.concatenate( + [np.arange(i, i + self.clip_len) for i in start_inds]) + elif self.clip_len <= num_frames < self.clip_len * 2: + all_inds = [] + for i in range(self.num_clips): + basic = np.arange(self.clip_len) + inds = np.random.choice( + self.clip_len + 1, + num_frames - self.clip_len, + replace=False) + offset = np.zeros(self.clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + all_inds.append(inds) + inds = np.concatenate(all_inds) + else: + bids = np.array([ + i * num_frames // self.clip_len + for i in range(self.clip_len + 1) + ]) + bsize = np.diff(bids) + bst = bids[:self.clip_len] + all_inds = [] + for i in range(self.num_clips): + offset = np.random.randint(bsize) + all_inds.append(bst + offset) + inds = np.concatenate(all_inds) + return inds + + def _get_repeat_sample_clips(self, num_frames): + """Repeat sample when video is shorter than clip_len Modified from + https://github.com/facebookresearch/SlowFast/blob/64ab + cc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. + + When video frames is shorter than target clip len, this strategy would + repeat sample frame, rather than loop sample in 'loop' mode. + In test mode, this strategy would sample the middle frame of each + segment, rather than set a random seed, and therefore only support + sample 1 clip. + + Args: + num_frames (int): Total number of frame in the video. + Returns: + seq (list): the indexes of frames of sampled from the video. + """ + assert self.num_clips == 1 + seg_size = float(num_frames - 1) / self.clip_len + inds = [] + for i in range(self.clip_len): + start = int(np.round(seg_size * i)) + end = int(np.round(seg_size * (i + 1))) + if not self.test_mode: + inds.append(np.random.randint(start, end + 1)) + else: + inds.append((start + end) // 2) + + return np.array(inds) + + def transform(self, results): + num_frames = results['total_frames'] + + if self.out_of_bound_opt == 'loop': + if self.test_mode: + inds = self._get_test_clips(num_frames) + else: + inds = self._get_train_clips(num_frames) + inds = np.mod(inds, num_frames) + elif self.out_of_bound_opt == 'repeat_frame': + inds = self._get_repeat_sample_clips(num_frames) + else: + raise ValueError('Illegal out_of_bound option.') + + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}, ' + f'seed={self.seed})') + return repr_str + + @TRANSFORMS.register_module() class UntrimmedSampleFrames(BaseTransform): """Sample frames from the untrimmed video. diff --git a/mmaction/datasets/transforms/pose_loading.py b/mmaction/datasets/transforms/pose_loading.py index 58748eacb6..592850334f 100644 --- a/mmaction/datasets/transforms/pose_loading.py +++ b/mmaction/datasets/transforms/pose_loading.py @@ -11,141 +11,6 @@ from .processing import Flip -@TRANSFORMS.register_module() -class UniformSampleFrames(BaseTransform): - """Uniformly sample frames from the video. 
- - To sample an n-frame clip from the video. UniformSampleFrames basically - divide the video into n segments of equal length and randomly sample one - frame from each segment. To make the testing results reproducible, a - random seed is set during testing, to make the sampling results - deterministic. - - Required keys are ``'total_frames'``, ``'start_index'`` , added or - modified keys are ``'frame_inds'``, ``'clip_len'``, - ``'frame_interval'`` and ``'num_clips'``. - - Args: - clip_len (int): Frames of each sampled output clip. - num_clips (int): Number of clips to be sampled. Defaults to 1. - test_mode (bool): Store True when building test or validation dataset. - Defaults to False. - seed (int): The random seed used during test time. Defaults to 255. - """ - - def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255): - - self.clip_len = clip_len - self.num_clips = num_clips - self.test_mode = test_mode - self.seed = seed - - def _get_train_clips(self, num_frames, clip_len): - """Uniformly sample indices for training clips. - - Args: - num_frames (int): The number of frames. - clip_len (int): The length of the clip. - """ - - assert self.num_clips == 1 - if num_frames < clip_len: - start = np.random.randint(0, num_frames) - inds = np.arange(start, start + clip_len) - elif clip_len <= num_frames < 2 * clip_len: - basic = np.arange(clip_len) - inds = np.random.choice( - clip_len + 1, num_frames - clip_len, replace=False) - offset = np.zeros(clip_len + 1, dtype=np.int32) - offset[inds] = 1 - offset = np.cumsum(offset) - inds = basic + offset[:-1] - else: - bids = np.array( - [i * num_frames // clip_len for i in range(clip_len + 1)]) - bsize = np.diff(bids) - bst = bids[:clip_len] - offset = np.random.randint(bsize) - inds = bst + offset - return inds - - def _get_test_clips(self, num_frames, clip_len): - """Uniformly sample indices for testing clips. - - Args: - num_frames (int): The number of frames. - clip_len (int): The length of the clip. - """ - - np.random.seed(self.seed) - if num_frames < clip_len: - # Then we use a simple strategy - if num_frames < self.num_clips: - start_inds = list(range(self.num_clips)) - else: - start_inds = [ - i * num_frames // self.num_clips - for i in range(self.num_clips) - ] - inds = np.concatenate( - [np.arange(i, i + clip_len) for i in start_inds]) - elif clip_len <= num_frames < clip_len * 2: - all_inds = [] - for i in range(self.num_clips): - basic = np.arange(clip_len) - inds = np.random.choice( - clip_len + 1, num_frames - clip_len, replace=False) - offset = np.zeros(clip_len + 1, dtype=np.int32) - offset[inds] = 1 - offset = np.cumsum(offset) - inds = basic + offset[:-1] - all_inds.append(inds) - inds = np.concatenate(all_inds) - else: - bids = np.array( - [i * num_frames // clip_len for i in range(clip_len + 1)]) - bsize = np.diff(bids) - bst = bids[:clip_len] - all_inds = [] - for i in range(self.num_clips): - offset = np.random.randint(bsize) - all_inds.append(bst + offset) - inds = np.concatenate(all_inds) - return inds - - def transform(self, results): - """Perform the SampleFrames loading. - - Args: - results (dict): The resulting dict to be modified and passed - to the next transform in pipeline. 
- """ - num_frames = results['total_frames'] - - if self.test_mode: - inds = self._get_test_clips(num_frames, self.clip_len) - else: - inds = self._get_train_clips(num_frames, self.clip_len) - - inds = np.mod(inds, num_frames) - start_index = results['start_index'] - inds = inds + start_index - - results['frame_inds'] = inds.astype(np.int32) - results['clip_len'] = self.clip_len - results['frame_interval'] = None - results['num_clips'] = self.num_clips - return results - - def __repr__(self): - repr_str = (f'{self.__class__.__name__}(' - f'clip_len={self.clip_len}, ' - f'num_clips={self.num_clips}, ' - f'test_mode={self.test_mode}, ' - f'seed={self.seed})') - return repr_str - - @TRANSFORMS.register_module() class PoseDecode(BaseTransform): """Load and decode pose with given indices. diff --git a/mmaction/datasets/transforms/processing.py b/mmaction/datasets/transforms/processing.py index 6ea381030f..d34bc93327 100644 --- a/mmaction/datasets/transforms/processing.py +++ b/mmaction/datasets/transforms/processing.py @@ -1,12 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. import random import warnings +from numbers import Number +from typing import Sequence import cv2 import mmcv import mmengine import numpy as np from mmcv.transforms import BaseTransform +from mmcv.transforms.utils import cache_randomness from torch.nn.modules.utils import _pair from mmaction.registry import TRANSFORMS @@ -1491,3 +1494,170 @@ def __repr__(self): f'n_mels={self.n_mels}, ' f'fixed_length={self.fixed_length})') return repr_str + + +@TRANSFORMS.register_module() +class RandomErasing(BaseTransform): + """Randomly selects a rectangle region in an image and erase pixels. + basically refer mmcls. + + **Required Keys:** + + - img + + **Modified Keys:** + + - img + + Args: + erase_prob (float): Probability that image will be randomly erased. + Default: 0.5 + min_area_ratio (float): Minimum erased area / input image area + Default: 0.02 + max_area_ratio (float): Maximum erased area / input image area + Default: 1/3 + aspect_range (sequence | float): Aspect ratio range of erased area. + if float, it will be converted to (aspect_ratio, 1/aspect_ratio) + Default: (3/10, 10/3) + mode (str): Fill method in erased area, can be: + + - const (default): All pixels are assign with the same value. + - rand: each pixel is assigned with a random value in [0, 255] + + fill_color (sequence | Number): Base color filled in erased area. + Defaults to (128, 128, 128). + fill_std (sequence | Number, optional): If set and ``mode`` is 'rand', + fill erased area with random color from normal distribution + (mean=fill_color, std=fill_std); If not set, fill erased area with + random color from uniform distribution (0~255). Defaults to None. + + Note: + See `Random Erasing Data Augmentation + `_ + + This paper provided 4 modes: RE-R, RE-M, RE-0, RE-255, and use RE-M as + default. The config of these 4 modes are: + + - RE-R: RandomErasing(mode='rand') + - RE-M: RandomErasing(mode='const', fill_color=(123.67, 116.3, 103.5)) + - RE-0: RandomErasing(mode='const', fill_color=0) + - RE-255: RandomErasing(mode='const', fill_color=255) + """ + + def __init__(self, + erase_prob=0.5, + min_area_ratio=0.02, + max_area_ratio=1 / 3, + aspect_range=(3 / 10, 10 / 3), + mode='const', + fill_color=(128, 128, 128), + fill_std=None): + assert isinstance(erase_prob, float) and 0. <= erase_prob <= 1. + assert isinstance(min_area_ratio, float) and 0. <= min_area_ratio <= 1. + assert isinstance(max_area_ratio, float) and 0. <= max_area_ratio <= 1. 
+ assert min_area_ratio <= max_area_ratio, \ + 'min_area_ratio should be smaller than max_area_ratio' + if isinstance(aspect_range, float): + aspect_range = min(aspect_range, 1 / aspect_range) + aspect_range = (aspect_range, 1 / aspect_range) + assert isinstance(aspect_range, Sequence) and len(aspect_range) == 2 \ + and all(isinstance(x, float) for x in aspect_range), \ + 'aspect_range should be a float or Sequence with two float.' + assert all(x > 0 for x in aspect_range), \ + 'aspect_range should be positive.' + assert aspect_range[0] <= aspect_range[1], \ + 'In aspect_range (min, max), min should be smaller than max.' + assert mode in ['const', 'rand'], \ + 'Please select `mode` from ["const", "rand"].' + if isinstance(fill_color, Number): + fill_color = [fill_color] * 3 + assert isinstance(fill_color, Sequence) and len(fill_color) == 3 \ + and all(isinstance(x, Number) for x in fill_color), \ + 'fill_color should be a float or Sequence with three int.' + if fill_std is not None: + if isinstance(fill_std, Number): + fill_std = [fill_std] * 3 + assert isinstance(fill_std, Sequence) and len(fill_std) == 3 \ + and all(isinstance(x, Number) for x in fill_std), \ + 'fill_std should be a float or Sequence with three int.' + + self.erase_prob = erase_prob + self.min_area_ratio = min_area_ratio + self.max_area_ratio = max_area_ratio + self.aspect_range = aspect_range + self.mode = mode + self.fill_color = fill_color + self.fill_std = fill_std + + def _img_fill_pixels(self, img, top, left, h, w): + """Fill pixels to the patch of image.""" + if self.mode == 'const': + patch = np.empty((h, w, 3), dtype=np.uint8) + patch[:, :] = np.array(self.fill_color, dtype=np.uint8) + elif self.fill_std is None: + # Uniform distribution + patch = np.random.uniform(0, 256, (h, w, 3)).astype(np.uint8) + else: + # Normal distribution + patch = np.random.normal(self.fill_color, self.fill_std, (h, w, 3)) + patch = np.clip(patch.astype(np.int32), 0, 255).astype(np.uint8) + + img[top:top + h, left:left + w] = patch + return img + + def _fill_pixels(self, imgs, top, left, h, w): + """Fill pixels to the patch of each image in frame clip.""" + return [self._img_fill_pixels(img, top, left, h, w) for img in imgs] + + @cache_randomness + def random_disable(self): + """Randomly disable the transform.""" + return np.random.rand() > self.erase_prob + + @cache_randomness + def random_patch(self, img_h, img_w): + """Randomly generate patch the erase.""" + # convert the aspect ratio to log space to equally handle width and + # height. + log_aspect_range = np.log( + np.array(self.aspect_range, dtype=np.float32)) + aspect_ratio = np.exp(np.random.uniform(*log_aspect_range)) + area = img_h * img_w + area *= np.random.uniform(self.min_area_ratio, self.max_area_ratio) + + h = min(int(round(np.sqrt(area * aspect_ratio))), img_h) + w = min(int(round(np.sqrt(area / aspect_ratio))), img_w) + top = np.random.randint(0, img_h - h) if img_h > h else 0 + left = np.random.randint(0, img_w - w) if img_w > w else 0 + return top, left, h, w + + def transform(self, results): + """ + Args: + results (dict): Results dict from pipeline + + Returns: + dict: Results after the transformation. 
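+        Note (illustrative): one random patch location is sampled per call
+        and applied to every frame in ``results['imgs']``, so the erased
+        region is temporally consistent across the clip; only the random
+        fill values differ per frame in 'rand' mode.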
+ """ + if self.random_disable(): + return results + + imgs = results['imgs'] + img_h, img_w = imgs[0].shape[:2] + + imgs = self._fill_pixels(imgs, *self.random_patch(img_h, img_w)) + + results['imgs'] = imgs + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(erase_prob={self.erase_prob}, ' + repr_str += f'min_area_ratio={self.min_area_ratio}, ' + repr_str += f'max_area_ratio={self.max_area_ratio}, ' + repr_str += f'aspect_range={self.aspect_range}, ' + repr_str += f'mode={self.mode}, ' + repr_str += f'fill_color={self.fill_color}, ' + repr_str += f'fill_std={self.fill_std})' + return repr_str diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py index 30301b2b28..6a2c7b526a 100644 --- a/mmaction/models/backbones/__init__.py +++ b/mmaction/models/backbones/__init__.py @@ -4,6 +4,7 @@ from .c3d import C3D from .mobilenet_v2 import MobileNetV2 from .mobilenet_v2_tsm import MobileNetV2TSM +from .mvit import MViT from .resnet import ResNet from .resnet2plus1d import ResNet2Plus1d from .resnet3d import ResNet3d, ResNet3dLayer @@ -24,5 +25,6 @@ 'C2D', 'C3D', 'ResNet', 'ResNet3d', 'ResNetTSM', 'ResNet2Plus1d', 'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNet3dCSN', 'ResNetTIN', 'X3D', 'ResNet3dLayer', 'MobileNetV2TSM', 'MobileNetV2', 'TANet', 'TimeSformer', - 'STGCN', 'AGCN', 'ResNetAudio', 'SwinTransformer3D', 'VisionTransformer' + 'STGCN', 'AGCN', 'ResNetAudio', 'SwinTransformer3D', 'VisionTransformer', + 'MViT' ] diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py new file mode 100644 index 0000000000..7974767cfc --- /dev/null +++ b/mmaction/models/backbones/mvit.py @@ -0,0 +1,850 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import trunc_normal_ +from mmengine.utils import to_3tuple + +from mmaction.registry import MODELS +from ..utils.embed import PatchEmbed3D + + +def resize_pos_embed(pos_embed, + src_shape, + dst_shape, + mode='trilinear', + num_extra_tokens=1): + """Resize pos_embed weights. + + Args: + pos_embed (torch.Tensor): Position embedding weights with shape + [1, L, C]. + src_shape (tuple): The resolution of downsampled origin training + image, in format (T, H, W). + dst_shape (tuple): The resolution of downsampled new training + image, in format (T, H, W). + mode (str): Algorithm used for upsampling. Choose one from 'nearest', + 'linear', 'bilinear', 'bicubic' and 'trilinear'. + Defaults to 'trilinear'. + num_extra_tokens (int): The number of extra tokens, such as cls_token. + Defaults to 1. + + Returns: + torch.Tensor: The resized pos_embed of shape [1, L_new, C] + """ + if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1] \ + and src_shape[2] == dst_shape[2]: + return pos_embed + assert pos_embed.ndim == 3, 'shape of pos_embed must be [1, L, C]' + _, L, C = pos_embed.shape + src_t, src_h, src_w = src_shape + assert L == src_t * src_h * src_w + num_extra_tokens, \ + f"The length of `pos_embed` ({L}) doesn't match the expected " \ + f'shape ({src_t}*{src_h}*{src_w}+{num_extra_tokens}).' \ + 'Please check the `img_size` argument.' 
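+    # Keep the extra (e.g. class) tokens untouched; only the T*H*W grid
+    # tokens below are reshaped to 3D and interpolated to `dst_shape`
+    # (trilinear by default).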
+ extra_tokens = pos_embed[:, :num_extra_tokens] + + src_weight = pos_embed[:, num_extra_tokens:] + src_weight = src_weight.reshape(1, src_t, src_h, src_w, + C).permute(0, 4, 1, 2, 3) + + dst_weight = F.interpolate( + src_weight, size=dst_shape, align_corners=False, mode=mode) + dst_weight = torch.flatten(dst_weight, 2).transpose(1, 2) + + return torch.cat((extra_tokens, dst_weight), dim=1) + + +def resize_decomposed_rel_pos(rel_pos, q_size, k_size): + """Get relative positional embeddings according to the relative positions + of query and key sizes. + + Args: + rel_pos (Tensor): relative position embeddings (L, C). + q_size (int): size of query q. + k_size (int): size of key k. + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + resized = F.interpolate( + # (L, C) -> (1, C, L) + rel_pos.transpose(0, 1).unsqueeze(0), + size=max_rel_dist, + mode='linear', + ) + # (1, C, L) -> (L, C) + resized = resized.squeeze(0).transpose(0, 1) + else: + resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_h_ratio = max(k_size / q_size, 1.0) + k_h_ratio = max(q_size / k_size, 1.0) + q_coords = torch.arange(q_size)[:, None] * q_h_ratio + k_coords = torch.arange(k_size)[None, :] * k_h_ratio + relative_coords = (q_coords - k_coords) + (k_size - 1) * k_h_ratio + + return resized[relative_coords.long()] + + +def add_decomposed_rel_pos(attn, + q, + q_shape, + k_shape, + rel_pos_h, + rel_pos_w, + rel_pos_t, + with_cls_token=False): + """Spatiotemporal Relative Positional Embeddings.""" + sp_idx = 1 if with_cls_token else 0 + B, num_heads, _, C = q.shape + q_t, q_h, q_w = q_shape + k_t, k_h, k_w = k_shape + + Rt = resize_decomposed_rel_pos(rel_pos_t, q_t, k_t) + Rh = resize_decomposed_rel_pos(rel_pos_h, q_h, k_h) + Rw = resize_decomposed_rel_pos(rel_pos_w, q_w, k_w) + + r_q = q[:, :, sp_idx:].reshape(B, num_heads, q_t, q_h, q_w, C) + rel_t = torch.einsum('bythwc,tkc->bythwk', r_q, Rt) + rel_h = torch.einsum('bythwc,hkc->bythwk', r_q, Rh) + rel_w = torch.einsum('bythwc,wkc->bythwk', r_q, Rw) + rel_pos_embed = ( + rel_t[:, :, :, :, :, :, None, None] + + rel_h[:, :, :, :, :, None, :, None] + + rel_w[:, :, :, :, :, None, None, :]) + + attn_map = attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_t, q_h, q_w, k_t, + k_h, k_w) + attn_map += rel_pos_embed + attn[:, :, sp_idx:, sp_idx:] = attn_map.view(B, -1, q_t * q_h * q_w, + k_t * k_h * k_w) + + return attn + + +class MLP(BaseModule): + """Two-layer multilayer perceptron. + + Comparing with :class:`mmcv.cnn.bricks.transformer.FFN`, this class allows + different input and output channel numbers. + + Args: + in_channels (int): The number of input channels. + hidden_channels (int, optional): The number of hidden layer channels. + If None, same as the ``in_channels``. Defaults to None. + out_channels (int, optional): The number of output channels. If None, + same as the ``in_channels``. Defaults to None. + act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
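+    Note: MViT-style blocks typically instantiate this with
+    ``hidden_channels`` around 4x ``in_channels`` (the common transformer
+    MLP ratio); treat the exact ratio as configuration-dependent.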
+ """ + + def __init__(self, + in_channels, + hidden_channels=None, + out_channels=None, + act_cfg=dict(type='GELU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + out_channels = out_channels or in_channels + hidden_channels = hidden_channels or in_channels + self.fc1 = nn.Linear(in_channels, hidden_channels) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Linear(hidden_channels, out_channels) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +def attention_pool(x: torch.Tensor, + pool: nn.Module, + in_size: tuple, + with_cls_token: bool = False, + norm: Optional[nn.Module] = None): + """Pooling the feature tokens. + + Args: + x (torch.Tensor): The input tensor, should be with shape + ``(B, num_heads, L, C)`` or ``(B, L, C)``. + pool (nn.Module): The pooling module. + in_size (Tuple[int]): The shape of the input feature map. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + norm (nn.Module, optional): The normalization module. + Defaults to None. + """ + ndim = x.ndim + if ndim == 4: + B, num_heads, L, C = x.shape + elif ndim == 3: + num_heads = 1 + B, L, C = x.shape + x = x.unsqueeze(1) + else: + raise RuntimeError(f'Unsupported input dimension {x.shape}') + + T, H, W = in_size + assert L == T * H * W + with_cls_token + + if with_cls_token: + cls_tok, x = x[:, :, :1, :], x[:, :, 1:, :] + + # (B, num_heads, T*H*W, C) -> (B*num_heads, C, T, H, W) + x = x.reshape(B * num_heads, T, H, W, C).permute(0, 4, 1, 2, + 3).contiguous() + x = pool(x) + out_size = x.shape[2:] + + # (B*num_heads, C, T', H', W') -> (B, num_heads, T'*H'*W', C) + x = x.reshape(B, num_heads, C, -1).transpose(2, 3) + + if with_cls_token: + x = torch.cat((cls_tok, x), dim=2) + + if norm is not None: + x = norm(x) + + if ndim == 3: + x = x.squeeze(1) + + return x, out_size + + +class MultiScaleAttention(BaseModule): + """Multiscale Multi-head Attention block. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + stride_q (int): stride size for q pooling layer. + Defaults to (1, 1, 1). + stride_kv (int): stride size for kv pooling layer. + Defaults to (1, 1, 1). + rel_pos_embed (bool): Whether to enable the spatial and temporal + relative position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_embed``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
+ """ + + def __init__(self, + in_dims, + out_dims, + num_heads, + qkv_bias=True, + norm_cfg=dict(type='LN'), + pool_kernel=(3, 3, 3), + stride_q=(1, 1, 1), + stride_kv=(1, 1, 1), + rel_pos_embed=True, + residual_pooling=True, + input_size=None, + rel_pos_zero_init=False, + with_cls_token=True, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_heads = num_heads + self.with_cls_token = with_cls_token + self.in_dims = in_dims + self.out_dims = out_dims + + head_dim = out_dims // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(in_dims, out_dims * 3, bias=qkv_bias) + self.proj = nn.Linear(out_dims, out_dims) + + # qkv pooling + pool_padding = [k // 2 for k in pool_kernel] + pool_dims = out_dims // num_heads + + def build_pooling(stride): + pool = nn.Conv3d( + pool_dims, + pool_dims, + pool_kernel, + stride=stride, + padding=pool_padding, + groups=pool_dims, + bias=False, + ) + norm = build_norm_layer(norm_cfg, pool_dims)[1] + return pool, norm + + self.pool_q, self.norm_q = build_pooling(stride_q) + self.pool_k, self.norm_k = build_pooling(stride_kv) + self.pool_v, self.norm_v = build_pooling(stride_kv) + + self.residual_pooling = residual_pooling + + self.rel_pos_embed = rel_pos_embed + self.rel_pos_zero_init = rel_pos_zero_init + if self.rel_pos_embed: + # initialize relative positional embeddings + assert input_size[1] == input_size[2] + + size = input_size[1] + rel_dim = 2 * max(size // stride_q[1], size // stride_kv[1]) - 1 + self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_t = nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim)) + + def init_weights(self): + """Weight initialization.""" + super().init_weights() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress rel_pos_zero_init if use pretrained model. + return + + if not self.rel_pos_zero_init: + trunc_normal_(self.rel_pos_h, std=0.02) + trunc_normal_(self.rel_pos_w, std=0.02) + if not self.rel_pos_zero_init: + trunc_normal_(self.rel_pos_t, std=0.02) + + def forward(self, x, in_size): + """Forward the MultiScaleAttention.""" + B, N, _ = x.shape # (B, H*W, C) + + # qkv: (B, H*W, 3, num_heads, C) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1) + # q, k, v: (B, num_heads, H*W, C) + q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0) + + q, q_shape = attention_pool( + q, + self.pool_q, + in_size, + norm=self.norm_q, + with_cls_token=self.with_cls_token) + k, k_shape = attention_pool( + k, + self.pool_k, + in_size, + norm=self.norm_k, + with_cls_token=self.with_cls_token) + v, v_shape = attention_pool( + v, + self.pool_v, + in_size, + norm=self.norm_v, + with_cls_token=self.with_cls_token) + + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.rel_pos_embed: + attn = add_decomposed_rel_pos(attn, q, q_shape, k_shape, + self.rel_pos_h, self.rel_pos_w, + self.rel_pos_t, self.with_cls_token) + + attn = attn.softmax(dim=-1) + x = attn @ v + + if self.residual_pooling: + if self.with_cls_token: + x[:, :, 1:, :] += q[:, :, 1:, :] + else: + x = x + q + + # (B, num_heads, H'*W', C'//num_heads) -> (B, H'*W', C') + x = x.transpose(1, 2).reshape(B, -1, self.out_dims) + x = self.proj(x) + + return x, q_shape + + +class MultiScaleBlock(BaseModule): + """Multiscale Transformer blocks. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. 
+ mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + drop_path (float): Stochastic depth rate. Defaults to 0. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + qkv_pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + stride_q (int): stride size for q pooling layer. + Defaults to (1, 1, 1). + stride_kv (int): stride size for kv pooling layer. + Defaults to (1, 1, 1). + rel_pos_embed (bool): Whether to enable the spatial relative + position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in + attention layers. If False, multiply it in MLP layers. + Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_embed``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. + """ + + def __init__( + self, + in_dims, + out_dims, + num_heads, + mlp_ratio=4.0, + qkv_bias=True, + drop_path=0.0, + norm_cfg=dict(type='LN'), + act_cfg=dict(type='GELU'), + qkv_pool_kernel=(3, 3, 3), + stride_q=(1, 1, 1), + stride_kv=(1, 1, 1), + rel_pos_embed=True, + residual_pooling=True, + with_cls_token=True, + dim_mul_in_attention=True, + input_size=None, + rel_pos_zero_init=False, + init_cfg=None, + ): + super().__init__(init_cfg=init_cfg) + self.with_cls_token = with_cls_token + self.in_dims = in_dims + self.out_dims = out_dims + self.norm1 = build_norm_layer(norm_cfg, in_dims)[1] + self.dim_mul_in_attention = dim_mul_in_attention + + attn_dims = out_dims if dim_mul_in_attention else in_dims + self.attn = MultiScaleAttention( + in_dims, + attn_dims, + num_heads=num_heads, + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + pool_kernel=qkv_pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + rel_pos_embed=rel_pos_embed, + residual_pooling=residual_pooling, + input_size=input_size, + rel_pos_zero_init=rel_pos_zero_init) + self.drop_path = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = build_norm_layer(norm_cfg, attn_dims)[1] + + self.mlp = MLP( + in_channels=attn_dims, + hidden_channels=int(attn_dims * mlp_ratio), + out_channels=out_dims, + act_cfg=act_cfg) + + if in_dims != out_dims: + self.proj = nn.Linear(in_dims, out_dims) + else: + self.proj = None + + if np.prod(stride_q) > 1: + kernel_skip = [s + 1 if s > 1 else s for s in stride_q] + padding_skip = [int(skip // 2) for skip in kernel_skip] + self.pool_skip = nn.MaxPool3d( + kernel_skip, stride_q, padding_skip, ceil_mode=False) + + if input_size is not None: + input_size = to_3tuple(input_size) + out_size = [size // s for size, s in zip(input_size, stride_q)] + self.init_out_size = out_size + else: + self.init_out_size = None + else: + self.pool_skip = None + self.init_out_size = input_size + + def forward(self, x, in_size): + x_norm = self.norm1(x) + x_attn, out_size = self.attn(x_norm, in_size) + + if self.dim_mul_in_attention and self.proj is not None: + skip = 
self.proj(x_norm) + else: + skip = x + + if self.pool_skip is not None: + skip, _ = attention_pool( + skip, + self.pool_skip, + in_size, + with_cls_token=self.with_cls_token) + + x = skip + self.drop_path(x_attn) + x_norm = self.norm2(x) + x_mlp = self.mlp(x_norm) + + if not self.dim_mul_in_attention and self.proj is not None: + skip = self.proj(x_norm) + else: + skip = x + + x = skip + self.drop_path(x_mlp) + + return x, out_size + + +@MODELS.register_module() +class MViT(BaseModule): + """Multi-scale ViT v2. + + A PyTorch implement of : `MViTv2: Improved Multiscale Vision Transformers + for Classification and Detection `_ + + Inspiration from `the official implementation + `_ and `the mmclassification + implementation `_ + + Args: + arch (str | dict): MViT architecture. If use string, choose + from 'tiny', 'small', 'base' and 'large'. If use dict, it should + have below keys: + + - **embed_dims** (int): The dimensions of embedding. + - **num_layers** (int): The number of layers. + - **num_heads** (int): The number of heads in attention + modules of the initial layer. + - **downscale_indices** (List[int]): The layer indices to downscale + the feature map. + + Defaults to 'base'. + spatial_size (int): The expected input spatial_size shape. + Defaults to 224. + temporal_size (int): The expected input temporal_size shape. + Defaults to 224. + in_channels (int): The num of input channels. Defaults to 3. + out_scales (int | Sequence[int]): The output scale indices. + They should not exceed the length of ``downscale_indices``. + Defaults to -1, which means the last scale. + drop_path_rate (float): Stochastic depth rate. Defaults to 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults to False. + interpolate_mode (str): Select the interpolate mode for absolute + position embedding vector resize. Defaults to "trilinear". + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3, 3). + dim_mul (int): The magnification for ``embed_dims`` in the downscale + layers. Defaults to 2. + head_mul (int): The magnification for ``num_heads`` in the downscale + layers. Defaults to 2. + adaptive_kv_stride (int): The stride size for kv pooling in the initial + layer. Defaults to (1, 8, 8). + rel_pos_embed (bool): Whether to enable the spatial and temporal + relative position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in + attention layers. If False, multiply it in MLP layers. + Defaults to True. + with_cls_token (bool): Whether concatenating class token into video + tokens as transformer input. Defaults to True. + output_cls_token (bool): Whether output the cls_token. If set True, + ``with_cls_token`` must be True. Defaults to True. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + qkv_bias (bool): enable bias for qkv if True. Defaults to True. + norm_cfg (dict): Config dict for normalization layer for all output + features. Defaults to ``dict(type='LN', eps=1e-6)``. + patch_cfg (dict): Config dict for the patch embedding layer. + Defaults to + ``dict(kernel_size=(3, 7, 7), + stride=(2, 4, 4), + padding=(1, 3, 3))``. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. 
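One detail of `MultiScaleBlock` above that is easy to miss: when `stride_q` downsamples the query tokens, the identity branch must shrink the same way, which is what `pool_skip` (a MaxPool3d with kernel `s + 1` per strided axis) does. A small standalone check of that arithmetic, with illustrative shapes:

```python
import torch
import torch.nn as nn

stride_q = (1, 2, 2)
kernel_skip = [s + 1 if s > 1 else s for s in stride_q]   # [1, 3, 3]
padding_skip = [k // 2 for k in kernel_skip]              # [0, 1, 1]
pool_skip = nn.MaxPool3d(kernel_skip, stride_q, padding_skip, ceil_mode=False)

tokens = torch.randn(1, 96, 8, 56, 56)                    # skip path as (B, C, T, H, W)
print(pool_skip(tokens).shape)                            # torch.Size([1, 96, 8, 28, 28])
```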
+ + Examples: + >>> import torch + >>> from mmaction.registry import MODELS + >>> from mmaction.utils import register_all_modules + >>> register_all_modules() + >>> + >>> cfg = dict(type='MViT', arch='tiny', out_scales=[0, 1, 2, 3]) + >>> model = model = MODELS.build(cfg) + >>> inputs = torch.rand(1, 3, 16, 224, 224) + >>> outputs = model(inputs) + >>> for i, output in enumerate(outputs): + >>> print(f'scale{i}: {output.shape}') + scale0: torch.Size([1, 96, 8, 56, 56]) + scale1: torch.Size([1, 192, 8, 28, 28]) + scale2: torch.Size([1, 384, 8, 14, 14]) + scale3: torch.Size([1, 768, 8, 7, 7]) + """ + arch_zoo = { + 'tiny': { + 'embed_dims': 96, + 'num_layers': 10, + 'num_heads': 1, + 'downscale_indices': [1, 3, 8] + }, + 'small': { + 'embed_dims': 96, + 'num_layers': 16, + 'num_heads': 1, + 'downscale_indices': [1, 3, 14] + }, + 'base': { + 'embed_dims': 96, + 'num_layers': 24, + 'num_heads': 1, + 'downscale_indices': [2, 5, 21] + }, + 'large': { + 'embed_dims': 144, + 'num_layers': 48, + 'num_heads': 2, + 'downscale_indices': [2, 8, 44] + }, + } + num_extra_tokens = 1 + + def __init__(self, + arch='base', + spatial_size=224, + temporal_size=16, + in_channels=3, + pretrained=None, + out_scales=-1, + drop_path_rate=0., + use_abs_pos_embed=False, + interpolate_mode='trilinear', + pool_kernel=(3, 3, 3), + dim_mul=2, + head_mul=2, + adaptive_kv_stride=(1, 8, 8), + rel_pos_embed=True, + residual_pooling=True, + dim_mul_in_attention=True, + with_cls_token=True, + output_cls_token=True, + rel_pos_zero_init=False, + mlp_ratio=4., + qkv_bias=True, + norm_cfg=dict(type='LN', eps=1e-6), + patch_cfg=dict( + kernel_size=(3, 7, 7), + stride=(2, 4, 4), + padding=(1, 3, 3)), + init_cfg=None): + if pretrained: + init_cfg = dict(type='Pretrained', checkpoint=pretrained) + super().__init__(init_cfg=init_cfg) + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = { + 'embed_dims', 'num_layers', 'num_heads', 'downscale_indices' + } + assert isinstance(arch, dict) and essential_keys <= set(arch), \ + f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.num_layers = self.arch_settings['num_layers'] + self.num_heads = self.arch_settings['num_heads'] + self.downscale_indices = self.arch_settings['downscale_indices'] + self.num_scales = len(self.downscale_indices) + 1 + self.stage_indices = { + index - 1: i + for i, index in enumerate(self.downscale_indices) + } + self.stage_indices[self.num_layers - 1] = self.num_scales - 1 + self.use_abs_pos_embed = use_abs_pos_embed + self.interpolate_mode = interpolate_mode + + if isinstance(out_scales, int): + out_scales = [out_scales] + assert isinstance(out_scales, Sequence), \ + f'"out_scales" must by a sequence or int, ' \ + f'get {type(out_scales)} instead.' 
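The arch settings above can be related to output scales with a short pure-Python check that mirrors the `stage_indices` bookkeeping in `__init__` (values here are the 'tiny' settings): each index in `downscale_indices` starts a new, downscaled stage, and the layer right before each downscale plus the final layer are the per-scale outputs.

```python
num_layers = 10
downscale_indices = [1, 3, 8]
num_scales = len(downscale_indices) + 1                   # 4 scales

stage_indices = {idx - 1: i for i, idx in enumerate(downscale_indices)}
stage_indices[num_layers - 1] = num_scales - 1
print(stage_indices)                                       # {0: 0, 2: 1, 7: 2, 9: 3}
```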
+ for i, index in enumerate(out_scales): + if index < 0: + out_scales[i] = self.num_scales + index + assert 0 <= out_scales[i] <= self.num_scales, \ + f'Invalid out_scales {index}' + self.out_scales = sorted(list(out_scales)) + + # Set patch embedding + _patch_cfg = dict( + in_channels=in_channels, + input_size=(temporal_size, spatial_size, spatial_size), + embed_dims=self.embed_dims, + conv_type='Conv3d', + ) + _patch_cfg.update(patch_cfg) + self.patch_embed = PatchEmbed3D(**_patch_cfg) + self.patch_resolution = self.patch_embed.init_out_size + + # Set cls token + if output_cls_token: + assert with_cls_token is True, f'with_cls_token must be True if' \ + f'set output_cls_token to True, but got {with_cls_token}' + self.with_cls_token = with_cls_token + self.output_cls_token = output_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims)) + + # Set absolute position embedding + if self.use_abs_pos_embed: + num_patches = np.prod(self.patch_resolution) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_extra_tokens, + self.embed_dims)) + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, self.num_layers) + + self.blocks = ModuleList() + out_dims_list = [self.embed_dims] + num_heads = self.num_heads + stride_kv = adaptive_kv_stride + input_size = self.patch_resolution + for i in range(self.num_layers): + if i in self.downscale_indices: + num_heads *= head_mul + stride_q = [1, 2, 2] + stride_kv = [max(s // 2, 1) for s in stride_kv] + else: + stride_q = [1, 1, 1] + + # Set output embed_dims + if dim_mul_in_attention and i in self.downscale_indices: + # multiply embed_dims in downscale layers. + out_dims = out_dims_list[-1] * dim_mul + elif not dim_mul_in_attention and i + 1 in self.downscale_indices: + # multiply embed_dims before downscale layers. + out_dims = out_dims_list[-1] * dim_mul + else: + out_dims = out_dims_list[-1] + + attention_block = MultiScaleBlock( + in_dims=out_dims_list[-1], + out_dims=out_dims, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop_path=dpr[i], + norm_cfg=norm_cfg, + qkv_pool_kernel=pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + rel_pos_embed=rel_pos_embed, + residual_pooling=residual_pooling, + dim_mul_in_attention=dim_mul_in_attention, + input_size=input_size, + rel_pos_zero_init=rel_pos_zero_init) + self.blocks.append(attention_block) + + input_size = attention_block.init_out_size + out_dims_list.append(out_dims) + + if i in self.stage_indices: + stage_index = self.stage_indices[i] + if stage_index in self.out_scales: + norm_layer = build_norm_layer(norm_cfg, out_dims)[1] + self.add_module(f'norm{stage_index}', norm_layer) + + def init_weights(self): + super().init_weights() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress default init if use pretrained model. 
+ return + + if self.use_abs_pos_embed: + trunc_normal_(self.pos_embed, std=0.02) + + def forward(self, x): + """Forward the MViT.""" + B = x.shape[0] + x, patch_resolution = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + if self.use_abs_pos_embed: + x = x + resize_pos_embed( + self.pos_embed, + self.patch_resolution, + patch_resolution, + mode=self.interpolate_mode, + num_extra_tokens=self.num_extra_tokens) + + if not self.with_cls_token: + # Remove class token for transformer encoder input + x = x[:, 1:] + + outs = [] + for i, block in enumerate(self.blocks): + x, patch_resolution = block(x, patch_resolution) + + if i in self.stage_indices: + stage_index = self.stage_indices[i] + if stage_index in self.out_scales: + B, _, C = x.shape + x = getattr(self, f'norm{stage_index}')(x) + tokens = x.transpose(1, 2) + if self.with_cls_token: + patch_token = tokens[:, :, 1:].reshape( + B, C, *patch_resolution) + cls_token = tokens[:, :, 0] + else: + patch_token = tokens.reshape(B, C, *patch_resolution) + cls_token = None + if self.output_cls_token: + out = [patch_token, cls_token] + else: + out = patch_token + outs.append(out) + + return tuple(outs) diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index 79f852dc26..3395f96fe3 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .base import BaseHead from .i3d_head import I3DHead +from .mvit_head import MVitHead from .slowfast_head import SlowFastHead from .stgcn_head import STGCNHead from .timesformer_head import TimeSformerHead @@ -13,5 +14,6 @@ __all__ = [ 'TSNHead', 'I3DHead', 'BaseHead', 'TSMHead', 'SlowFastHead', 'TPNHead', - 'X3DHead', 'TRNHead', 'TimeSformerHead', 'STGCNHead', 'TSNAudioHead' + 'X3DHead', 'TRNHead', 'TimeSformerHead', 'STGCNHead', 'TSNAudioHead', + 'MVitHead' ] diff --git a/mmaction/models/heads/mvit_head.py b/mmaction/models/heads/mvit_head.py new file mode 100644 index 0000000000..eac4d30266 --- /dev/null +++ b/mmaction/models/heads/mvit_head.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +from mmengine.model.weight_init import trunc_normal_init +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class MVitHead(BaseHead): + """Classification head for TimeSformer. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + dropout_ratio (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Defaults to 0.02. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
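The head only consumes the cls token of the last backbone scale (see `pre_logits` below). As a minimal sketch of that data flow, with illustrative shapes and a plain `nn.Linear` standing in for `fc_cls`:

```python
import torch
import torch.nn as nn

# The backbone returns a tuple of per-scale [patch_token, cls_token] pairs.
feats = ([torch.randn(2, 768, 8, 7, 7), torch.randn(2, 768)],)

_, cls_token = feats[-1]            # take the cls token of the last scale
fc_cls = nn.Linear(768, 400)        # in_channels=768, num_classes=400
cls_score = fc_cls(cls_token)
print(cls_score.shape)              # torch.Size([2, 400])
```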
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + dropout_ratio: float = 0.5, + init_std: float = 0.02, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + self.init_std = init_std + self.dropout_ratio = dropout_ratio + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + trunc_normal_init(self.fc_cls, std=self.init_std) + + def pre_logits(self, feats: Tuple[List[Tensor]]) -> Tensor: + """The process before the final classification head. + + The input ``feats`` is a tuple of list of tensor, and each tensor is + the feature of a backbone stage. + """ + _, cls_token = feats[-1] + return cls_token + + def forward(self, x: Tuple[List[Tensor]], **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tuple[List[Tensor]]): The input data. + + Returns: + Tensor: The classification scores for input samples. + """ + x = self.pre_logits(x) + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/mmaction/models/recognizers/recognizer3d.py b/mmaction/models/recognizers/recognizer3d.py index 9de211d618..bb7e250157 100644 --- a/mmaction/models/recognizers/recognizer3d.py +++ b/mmaction/models/recognizers/recognizer3d.py @@ -69,16 +69,21 @@ def extract_feat(self, feat, _ = self.neck(feat) feats.append(feat) view_ptr += max_testing_views - # should consider the case that feat is a tuple - if isinstance(feats[0], tuple): - len_tuple = len(feats[0]) - feats = [ - torch.cat([each[i] for each in feats]) - for i in range(len_tuple) - ] - x = tuple(feats) - else: - x = torch.cat(feats) + # recursively traverse feats until it's a tensor, then concat + + def recursively_cat(feats): + out_feats = [] + for e_idx, elem in enumerate(feats[0]): + batch_elem = [feat[e_idx] for feat in feats] + if not isinstance(elem, torch.Tensor): + batch_elem = recursively_cat(batch_elem) + else: + batch_elem = torch.cat(batch_elem) + out_feats.append(batch_elem) + + return tuple(out_feats) + + x = recursively_cat(feats) else: x = self.backbone(inputs) if self.with_neck: diff --git a/mmaction/models/utils/__init__.py b/mmaction/models/utils/__init__.py index ed6ac50522..865ccbea99 100644 --- a/mmaction/models/utils/__init__.py +++ b/mmaction/models/utils/__init__.py @@ -1,6 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .blending_utils import (BaseMiniBatchBlending, CutmixBlending, - MixupBlending) + MixupBlending, RandomBatchAugment) from .graph import Graph -__all__ = ['BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'Graph'] +__all__ = [ + 'BaseMiniBatchBlending', 'CutmixBlending', 'MixupBlending', 'Graph', + 'RandomBatchAugment' +] diff --git a/mmaction/models/utils/blending_utils.py b/mmaction/models/utils/blending_utils.py index 94b929d7ff..64808d32f7 100644 --- a/mmaction/models/utils/blending_utils.py +++ b/mmaction/models/utils/blending_utils.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from abc import ABCMeta, abstractmethod +from typing import Union +import numpy as np import torch import torch.nn.functional as F from torch import Tensor @@ -177,3 +179,69 @@ def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: label = lam * label + (1 - lam) * label[rand_index, :] return imgs, label + + +@MODELS.register_module() +class RandomBatchAugment(BaseMiniBatchBlending): + """Randomly choose one batch augmentation to apply. + + Args: + augments (dict | list): configs of batch + augmentations. + probs (float | List[float] | None): The probabilities of each batch + augmentations. If None, choose evenly. Defaults to None. + + Example: + >>> augments_cfg = [ + ... dict(type='CutmixBlending', alpha=1., num_classes=10), + ... dict(type='MixupBlending', alpha=1., num_classes=10) + ... ] + >>> batch_augment = RandomBatchAugment(augments_cfg, probs=[0.5, 0.3]) + >>> imgs = torch.randn(16, 3, 8, 32, 32) + >>> label = torch.randint(0, 10, (16, )) + >>> imgs, label = batch_augment(imgs, label) + + .. note :: + + To decide which batch augmentation will be used, it picks one of + ``augments`` based on the probabilities. In the example above, the + probability to use CutmixBlending is 0.5, to use MixupBlending is 0.3, + and to do nothing is 0.2. + """ + + def __init__(self, augments: Union[dict, list], probs=None): + if not isinstance(augments, (tuple, list)): + augments = [augments] + + self.augments = [] + for aug in augments: + assert isinstance(aug, dict), \ + f'blending augment config must be a dict. Got {type(aug)}' + self.augments.append(MODELS.build(aug)) + + self.num_classes = augments[0].get('num_classes') + + if isinstance(probs, float): + probs = [probs] + + if probs is not None: + assert len(augments) == len(probs), \ + '``augments`` and ``probs`` must have same lengths. ' \ + f'Got {len(augments)} vs {len(probs)}.' + assert sum(probs) <= 1, \ + 'The total probability of batch augments exceeds 1.' + self.augments.append(None) + probs.append(1 - sum(probs)) + + self.probs = probs + + def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + """Randomly apply batch augmentations to the batch inputs and batch + data samples.""" + aug_index = np.random.choice(len(self.augments), p=self.probs) + aug = self.augments[aug_index] + + if aug is not None: + return aug.do_blending(imgs, label, **kwargs) + else: + return imgs, label diff --git a/mmaction/models/utils/embed.py b/mmaction/models/utils/embed.py new file mode 100644 index 0000000000..bfe805fb32 --- /dev/null +++ b/mmaction/models/utils/embed.py @@ -0,0 +1,234 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule +from mmengine.utils import to_3tuple + + +class AdaptivePadding(nn.Module): + """Applies padding adaptively to the input. + + This module can make input get fully covered by filter + you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad + zero around input. The "corner" mode would pad zero + to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel. Default: 1. + stride (int | tuple): Stride of the filter. Default: 1. + dilation (int | tuple): Spacing between kernel elements. + Default: 1. + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. 
Default: "corner". + + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + super().__init__() + assert padding in ('same', 'corner') + + kernel_size = to_3tuple(kernel_size) + stride = to_3tuple(stride) + dilation = to_3tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + """Calculate the padding size of input. + + Args: + input_shape (:obj:`torch.Size`): arrange as (H, W). + + Returns: + Tuple[int]: The padding size along the + original H and W directions + """ + input_t, input_h, input_w = input_shape + kernel_d, kernel_h, kernel_w = self.kernel_size + stride_d, stride_h, stride_w = self.stride + output_d = math.ceil(input_t / stride_d) + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_d = max((output_d - 1) * stride_d + + (kernel_d - 1) * self.dilation[0] + 1 - input_t, 0) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[1] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[2] + 1 - input_w, 0) + return pad_d, pad_h, pad_w + + def forward(self, x): + """Add padding to `x` + + Args: + x (Tensor): Input tensor has shape (B, C, H, W). + + Returns: + Tensor: The tensor with adaptive padding + """ + pad_d, pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_d > 0 or pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h, 0, pad_d]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, + pad_w - pad_w // 2, + pad_h // 2, + pad_h - pad_h // 2, + pad_d // 2, + pad_d - pad_d // 2, + ]) + return x + + +class PatchEmbed3D(BaseModule): + """Video to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The type of convolution + to generate patch embedding. Default: "Conv3d". + kernel_size (int): The kernel_size of embedding conv. + Default: (2, 4, 4). + stride (int): The slide stride of embedding conv. + Default: (2, 4, 4). + padding (int | tuple | string): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv3d', + kernel_size=(2, 4, 4), + stride=(2, 4, 4), + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_3tuple(kernel_size) + stride = to_3tuple(stride) + dilation = to_3tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adaptive_padding = None + padding = to_3tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_3tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # e.g. when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adaptive_padding: + pad_d, pad_h, pad_w = self.adaptive_padding.get_pad_shape( + input_size) + input_t, input_h, input_w = input_size + input_t = input_t + pad_d + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_t, input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv3d.html + t_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + h_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + w_out = (input_size[2] + 2 * padding[2] - dilation[2] * + (kernel_size[2] - 1) - 1) // stride[2] + 1 + self.init_out_size = (t_out, h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, T, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_t * out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_t, out_h, out_w). 
+ """ + + if self.adaptive_padding: + x = self.adaptive_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3], x.shape[4]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size diff --git a/tests/datasets/transforms/test_pose_loading.py b/tests/datasets/transforms/test_pose_loading.py index eeb2dad84c..fd7568798f 100644 --- a/tests/datasets/transforms/test_pose_loading.py +++ b/tests/datasets/transforms/test_pose_loading.py @@ -10,97 +10,11 @@ from numpy.testing import assert_array_almost_equal, assert_array_equal from mmaction.datasets.transforms import (GeneratePoseTarget, LoadKineticsPose, - PaddingWithLoop, PoseDecode, - UniformSampleFrames) + PaddingWithLoop, PoseDecode) class TestPoseLoading: - @staticmethod - def test_uniform_sample_frames(): - results = dict(total_frames=64, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=True, seed=0) - - assert str(sampling) == ('UniformSampleFrames(clip_len=8, ' - 'num_clips=1, test_mode=True, seed=0)') - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert_array_equal(sampling_results['frame_inds'], - np.array([4, 15, 21, 24, 35, 43, 51, 63])) - - results = dict(total_frames=15, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=True, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert_array_equal(sampling_results['frame_inds'], - np.array([0, 2, 4, 6, 8, 9, 11, 13])) - - results = dict(total_frames=7, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=True, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert_array_equal(sampling_results['frame_inds'], - np.array([0, 1, 2, 3, 4, 5, 6, 0])) - - results = dict(total_frames=7, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=8, test_mode=True, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 8 - assert len(sampling_results['frame_inds']) == 64 - - results = dict(total_frames=64, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=4, test_mode=True, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 4 - assert_array_equal( - sampling_results['frame_inds'], - np.array([ - 4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56, - 0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60 - ])) - - results = dict(total_frames=64, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=False, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert len(sampling_results['frame_inds']) == 8 - - results = dict(total_frames=7, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=False, seed=0) - 
sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert len(sampling_results['frame_inds']) == 8 - - results = dict(total_frames=15, start_index=0) - sampling = UniformSampleFrames( - clip_len=8, num_clips=1, test_mode=False, seed=0) - sampling_results = sampling(results) - assert sampling_results['clip_len'] == 8 - assert sampling_results['frame_interval'] is None - assert sampling_results['num_clips'] == 1 - assert len(sampling_results['frame_inds']) == 8 - @staticmethod def test_pose_decode(): kp = np.random.random([1, 16, 17, 2]) diff --git a/tests/datasets/transforms/test_sampling.py b/tests/datasets/transforms/test_sampling.py index f4a5e457bd..9450682315 100644 --- a/tests/datasets/transforms/test_sampling.py +++ b/tests/datasets/transforms/test_sampling.py @@ -9,7 +9,8 @@ from mmaction.datasets.transforms import (AudioFeatureSelector, DenseSampleFrames, SampleAVAFrames, - SampleFrames, UntrimmedSampleFrames) + SampleFrames, UniformSampleFrames, + UntrimmedSampleFrames) class BaseTestLoading: @@ -401,6 +402,90 @@ def check_monotonous(arr): assert np.max(sample_frames_results['frame_inds']) <= 40 assert np.min(sample_frames_results['frame_inds']) >= 1 + def test_uniform_sample_frames(self): + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + + assert str(sampling) == ('UniformSampleFrames(clip_len=8, ' + 'num_clips=1, test_mode=True, seed=0)') + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([4, 15, 21, 24, 35, 43, 51, 63])) + + results = dict(total_frames=15, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([0, 2, 4, 6, 8, 9, 11, 13])) + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert_array_equal(sampling_results['frame_inds'], + np.array([0, 1, 2, 3, 4, 5, 6, 0])) + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=8, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 8 + assert len(sampling_results['frame_inds']) == 64 + + results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=4, test_mode=True, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 4 + assert_array_equal( + sampling_results['frame_inds'], + np.array([ + 4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56, + 0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60 + ])) + + 
results = dict(total_frames=64, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + results = dict(total_frames=7, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + + results = dict(total_frames=15, start_index=0) + sampling = UniformSampleFrames( + clip_len=8, num_clips=1, test_mode=False, seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == 8 + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['frame_inds']) == 8 + def test_dense_sample_frames(self): target_keys = [ 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', diff --git a/tests/models/backbones/test_mvit.py b/tests/models/backbones/test_mvit.py new file mode 100644 index 0000000000..4ebdbc26db --- /dev/null +++ b/tests/models/backbones/test_mvit.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from copy import deepcopy +from unittest import TestCase + +import torch + +from mmaction.models import MViT + + +class TestMViT(TestCase): + + def setUp(self): + self.cfg = dict(arch='tiny', drop_path_rate=0.1) + + def test_structure(self): + # Test invalid default arch + with self.assertRaisesRegex(AssertionError, 'not in default archs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = 'unknown' + MViT(**cfg) + + # Test invalid custom arch + with self.assertRaisesRegex(AssertionError, 'Custom arch needs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'num_layers': 24, + 'num_heads': 16, + 'feedforward_channels': 4096 + } + MViT(**cfg) + + # Test custom arch + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'embed_dims': 96, + 'num_layers': 10, + 'num_heads': 1, + 'downscale_indices': [2, 5, 8] + } + stage_indices = [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] + model = MViT(**cfg) + self.assertEqual(model.embed_dims, 96) + self.assertEqual(model.num_layers, 10) + for i, block in enumerate(model.blocks): + stage = stage_indices[i] + self.assertEqual(block.out_dims, 96 * 2**(stage)) + + # Test out_indices + cfg = deepcopy(self.cfg) + cfg['out_scales'] = {1: 1} + with self.assertRaisesRegex(AssertionError, "get "): + MViT(**cfg) + cfg['out_scales'] = [0, 13] + with self.assertRaisesRegex(AssertionError, 'Invalid out_scales 13'): + MViT(**cfg) + + # Test model structure + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + stage_indices = [0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3] + self.assertEqual(len(model.blocks), 10) + dpr_inc = 0.1 / (10 - 1) + dpr = 0 + for i, block in enumerate(model.blocks): + stage = stage_indices[i] + print(i, stage) + self.assertEqual(block.attn.num_heads, 2**stage) + if dpr > 0: + self.assertAlmostEqual(block.drop_path.drop_prob, dpr) + dpr += dpr_inc + + def test_init_weights(self): + # test weight init cfg + cfg = deepcopy(self.cfg) + cfg['init_cfg'] = [ + dict( + type='Kaiming', + layer='Conv3d', + mode='fan_in', + nonlinearity='linear') + ] + cfg['use_abs_pos_embed'] = True + model = MViT(**cfg) + ori_weight = 
model.patch_embed.projection.weight.clone().detach() + # The pos_embed is all zero before initialize + self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.))) + + model.init_weights() + initialized_weight = model.patch_embed.projection.weight + self.assertFalse(torch.allclose(ori_weight, initialized_weight)) + self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.))) + + def test_forward(self): + imgs = torch.randn(1, 3, 16, 224, 224) + + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token, cls_token = outs[-1] + self.assertEqual(patch_token.shape, (1, 768, 8, 7, 7)) + + # Test forward with multi out scales + cfg = deepcopy(self.cfg) + cfg['out_scales'] = (0, 1, 2, 3) + model = MViT(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 4) + for stage, out in enumerate(outs): + stride = 2**stage + patch_token, cls_token = out + self.assertEqual(patch_token.shape, + (1, 96 * stride, 8, 56 // stride, 56 // stride)) + self.assertEqual(cls_token.shape, (1, 96 * stride)) + + # Test forward with dynamic input size + imgs1 = torch.randn(1, 3, 16, 224, 224) + imgs2 = torch.randn(1, 3, 16, 256, 256) + imgs3 = torch.randn(1, 3, 16, 256, 309) + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + for imgs in [imgs1, imgs2, imgs3]: + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token, cls_token = outs[-1] + expect_feat_shape = (math.ceil(imgs.shape[2] / 2), + math.ceil(imgs.shape[3] / 32), + math.ceil(imgs.shape[4] / 32)) + self.assertEqual(patch_token.shape, (1, 768, *expect_feat_shape)) + self.assertEqual(cls_token.shape, (1, 768)) diff --git a/tests/models/utils/test_blending_utils.py b/tests/models/utils/test_blending_utils.py index 2c19267681..359d9225dc 100644 --- a/tests/models/utils/test_blending_utils.py +++ b/tests/models/utils/test_blending_utils.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
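The `expect_feat_shape` used in the dynamic-input test above follows from the overall downsampling of this MViT variant: 2x in time (the 2x4x4 patchify) and 32x spatially (4x patchify followed by three 2x2 spatial downscales). A quick check for the 16x256x309 case:

```python
import math

t, h, w = 16, 256, 309
print(math.ceil(t / 2), math.ceil(h / 32), math.ceil(w / 32))  # 8 8 10
```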
+import numpy as np +import pytest import torch from mmengine.structures import LabelData -from mmaction.models import CutmixBlending, MixupBlending +from mmaction.models import CutmixBlending, MixupBlending, RandomBatchAugment from mmaction.structures import ActionDataSample @@ -53,3 +55,41 @@ def test_cutmix(): mixed_imgs, mixed_label = mixup(imgs, label) assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) assert len(mixed_label) == 4 + + +def test_rand_blend(): + alpha_mixup = 0.2 + alpha_cutmix = 0.2 + num_classes = 10 + label = get_label(torch.randint(0, num_classes, (4, ))) + blending_augs = [ + dict(type='MixupBlending', alpha=alpha_mixup, num_classes=num_classes), + dict( + type='CutmixBlending', alpha=alpha_cutmix, num_classes=num_classes) + ] + + # test assertion + with pytest.raises(AssertionError): + rand_mix = RandomBatchAugment(blending_augs, [0.5, 0.6]) + + # mixup, cutmix + rand_mix = RandomBatchAugment(blending_augs, probs=None) + assert rand_mix.probs is None + + # mixup, cutmix and None + probs = [0.5, 0.4] + rand_mix = RandomBatchAugment(blending_augs, probs) + + np.testing.assert_allclose(rand_mix.probs[-1], 0.1) + + # test call + imgs = torch.randn(4, 4, 3, 32, 32) # NCHW imgs + mixed_imgs, mixed_label = rand_mix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 3, 32, 32)) + assert len(mixed_label) == 4 + + imgs = torch.randn(4, 4, 2, 3, 32, 32) # NCTHW imgs + label = get_label(torch.randint(0, num_classes, (4, ))) + mixed_imgs, mixed_label = rand_mix(imgs, label) + assert mixed_imgs.shape == torch.Size((4, 4, 2, 3, 32, 32)) + assert len(mixed_label) == 4 From 25072a145b8d646c57685656b0fb9b5ad11bb955 Mon Sep 17 00:00:00 2001 From: lilin Date: Fri, 21 Oct 2022 15:04:33 +0800 Subject: [PATCH 2/8] [feat] support mvit --- configs/_base_/models/mvit_small.py | 12 +- configs/recognition/mvit/README.md | 24 ++-- configs/recognition/mvit/metafile.yml | 115 ++++++++++++++++++ .../mvit-base-p244_32x3x1_kinetics400-rgb.py | 14 ++- .../mvit/mvit-base-p244_u32_sthv2-rgb.py | 12 +- .../mvit-large-p244_40x3x1_kinetics400-rgb.py | 11 ++ .../mvit/mvit-large-p244_u40_sthv2-rgb.py | 12 +- .../mvit-small-p244_16x4x1_kinetics400-rgb.py | 13 ++ .../mvit/mvit-small-p244_u16_sthv2-rgb.py | 4 +- mmaction/datasets/transforms/loading.py | 74 ----------- mmaction/models/backbones/mvit.py | 34 ++++-- mmaction/models/heads/__init__.py | 4 +- mmaction/models/heads/mvit_head.py | 9 +- tests/models/backbones/test_mvit.py | 12 +- tests/models/heads/test_mvit_head.py | 32 +++++ 15 files changed, 260 insertions(+), 122 deletions(-) create mode 100644 configs/recognition/mvit/metafile.yml create mode 100644 tests/models/heads/test_mvit_head.py diff --git a/configs/_base_/models/mvit_small.py b/configs/_base_/models/mvit_small.py index 727df37c38..d6a94daa23 100644 --- a/configs/_base_/models/mvit_small.py +++ b/configs/_base_/models/mvit_small.py @@ -3,17 +3,11 @@ backbone=dict(type='MViT', arch='small', drop_path_rate=0.2), data_preprocessor=dict( type='ActionDataPreprocessor', - mean=[114.75, 114.75, 114.75], - std=[57.375, 57.375, 57.375], - blending=dict( - type='RandomBatchAugment', - augments=[ - dict(type='MixupBlending', alpha=0.8, num_classes=400), - dict(type='CutmixBlending', alpha=1, num_classes=400) - ]), + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], format_shape='NCTHW'), cls_head=dict( - type='MVitHead', + type='MViTHead', in_channels=768, num_classes=400, label_smooth_eps=0.1, diff --git a/configs/recognition/mvit/README.md 
b/configs/recognition/mvit/README.md index fdc694a128..ccd9611c2d 100644 --- a/configs/recognition/mvit/README.md +++ b/configs/recognition/mvit/README.md @@ -27,21 +27,21 @@ well as 86.1% on Kinetics-400 video classification. ### Kinetics-400 -| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | params | config | ckpt | -| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :----: | :------------------: | :-----------------: | -| 16x4x1 | short-side 320 | MViTv2-S\* | From scratch | 81.1 | 94.7 | [81.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | xx.xM | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | -| 32x3x1 | short-side 320 | MViTv2-B\* | From scratch | 82.6 | 95.8 | [82.9](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [95.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | xx.xM | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | -| 40x3x1 | short-side 320 | MViTv2-L\* | From scratch | 85.4 | 96.2 | [86.1](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [97.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | FLOPs | params | config | ckpt | +| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :-----------------------------: | :-----------------------------: | :--------------: | :---: | :----: | :-----------------: | :---------------: | +| 16x4x1 | short-side 320 | MViTv2-S\* | From scratch | 81.1 | 94.7 | [81.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.6](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | 64G | 34.5M | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth) | +| 32x3x1 | short-side 320 | MViTv2-B\* | From scratch | 82.6 | 95.8 | [82.9](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [95.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 1 crop | 225G | 51.2M | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_32x3x1_kinetics400-rgb_20221021-f392cd2d.pth) | +| 40x3x1 | short-side 320 | MViTv2-L\* | From scratch | 85.4 | 96.2 | [86.1](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 
[97.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 5 clips x 3 crop | 2828G | 213M | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_40x3x1_kinetics400-rgb_20221021-11fe1f97.pth) | ### Something-Something V2 -| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | params | config | ckpt | -| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :----: | :------------------: | :-----------------: | -| uniform 16 | short-side 320 | MViTv2-S\* | K400 | 68.1 | 91.0 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | -| uniform 32 | short-side 320 | MViTv2-B\* | K400 | 70.8 | 92.7 | [70.5](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [92.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | -| uniform 40 | short-side 320 | MViTv2-L\* | IN21K + K400 | 73.2 | 94.0 | [73.3](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crop | xx.xM | [config](/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/) | +| frame sampling strategy | resolution | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | FLOPs | params | config | ckpt | +| :---------------------: | :------------: | :--------: | :----------: | :------: | :------: | :----------------------------: | :-----------------------------: | :---------------: | :---: | :----: | :-----------------: | :---------------: | +| uniform 16 | short-side 320 | MViTv2-S\* | K400 | 68.1 | 91.0 | [68.2](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [91.4](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crops | 64G | 34.4M | [config](/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_u16_sthv2-rgb_20221021-65ecae7d.pth) | +| uniform 32 | short-side 320 | MViTv2-B\* | K400 | 70.8 | 92.7 | [70.5](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [92.7](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crops | 225G | 51.1M | [config](/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_u32_sthv2-rgb_20221021-d5de5da6.pth) | +| uniform 40 | short-side 320 | MViTv2-L\* | IN21K + K400 | 73.2 | 94.0 | 
[73.3](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | [94.0](https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md) | 1 clips x 3 crops | 2828G | 213M | [config](/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_u40_sthv2-rgb_20221021-61696e07.pth) | -*Models with * are ported from the repo [SlowFast](https://github.com/facebookresearch/SlowFast/) and tested on our data. Currently, we only support the testing of X3D models, training will be available soon.* +*Models with * are ported from the repo [SlowFast](https://github.com/facebookresearch/SlowFast/) and tested on our data. Currently, we only support the testing of MViT models, training will be available soon.* 1. The values in columns named after "reference" are copied from paper 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. @@ -59,7 +59,7 @@ python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] Example: test MViT model on Kinetics-400 dataset and dump the result to a pkl file. ```shell -python tools/test.py configs/recognition/mvit/mvit-small_16x4x1_kinetics400-rgb.py \ +python tools/test.py configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py \ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` diff --git a/configs/recognition/mvit/metafile.yml b/configs/recognition/mvit/metafile.yml new file mode 100644 index 0000000000..c5d7107482 --- /dev/null +++ b/configs/recognition/mvit/metafile.yml @@ -0,0 +1,115 @@ +Collections: +- Name: MViT + README: configs/recognition/MViT/README.md + Paper: + URL: http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf + Title: "MViTv2: Improved Multiscale Vision Transformers for Classification and Detection" + +Models: + - Name: mvit-small-p244_16x4x1_kinetics400-rgb + Config: configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 81.1 + Top 5 Accuracy: 94.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth + + - Name: mvit-base-p244_32x3x1_kinetics400-rgb + Config: configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-base + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + 
Task: Action Recognition + Metrics: + Top 1 Accuracy: 82.6 + Top 5 Accuracy: 95.8 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_32x3x1_kinetics400-rgb_20221021-f392cd2d.pth + + - Name: mvit-large-p244_40x3x1_kinetics400-rgb + Config: configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-large + Resolution: short-side 446 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 85.4 + Top 5 Accuracy: 96.2 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_40x3x1_kinetics400-rgb_20221021-11fe1f97.pth + + - Name: mvit-small-p244_u16_sthv2-rgb + Config: configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-small + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 68.1 + Top 5 Accuracy: 91.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_u16_sthv2-rgb_20221021-65ecae7d.pth + + - Name: mvit-base-p244_u32_sthv2-rgb + Config: configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-base + Resolution: short-side 320 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 70.8 + Top 5 Accuracy: 92.7 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-base-p244_u32_sthv2-rgb_20221021-d5de5da6.pth + + - Name: mvit-large-p244_u40_sthv2-rgb + Config: configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py + In Collection: MViT + Metadata: + Architecture: MViT-large + Resolution: short-side 446 + Modality: RGB + Converted From: + Weights: https://github.com/facebookresearch/SlowFast/blob/main/projects/mvitv2/README.md + Code: https://github.com/facebookresearch/SlowFast/ + Results: + - Dataset: SthV2 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 73.2 + Top 5 Accuracy: 94.0 + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-large-p244_u40_sthv2-rgb_20221021-61696e07.pth diff --git a/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py index 93b33a9dc9..b1e186f195 100644 --- a/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py +++ b/configs/recognition/mvit/mvit-base-p244_32x3x1_kinetics400-rgb.py @@ -7,7 +7,19 @@ arch='base', temporal_size=32, drop_path_rate=0.3, - )) + ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), +) # dataset settings dataset_type = 
'VideoDataset' diff --git a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py index c719396f29..944e17440d 100644 --- a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py @@ -8,6 +8,17 @@ temporal_size=32, drop_path_rate=0.3, ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=174), + dict(type='CutmixBlending', alpha=1, num_classes=174) + ]), + format_shape='NCTHW'), cls_head=dict(num_classes=174)) # dataset settings @@ -34,7 +45,6 @@ op='RandAugment', magnitude=7, num_layers=4), - dict(type='Flip', flip_ratio=0.5), dict(type='RandomErasing', erase_prob=0.25, mode='rand'), dict(type='FormatShape', input_format='NCTHW'), dict(type='PackActionInputs') diff --git a/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py index 883d9f7ce5..8c93519914 100644 --- a/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py +++ b/configs/recognition/mvit/mvit-large-p244_40x3x1_kinetics400-rgb.py @@ -9,6 +9,17 @@ spatial_size=312, drop_path_rate=0.75, ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), cls_head=dict(in_channels=1152), test_cfg=dict(max_testing_views=5)) diff --git a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py index c682571df6..9b47b27a10 100644 --- a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py @@ -9,6 +9,17 @@ spatial_size=312, drop_path_rate=0.75, ), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), cls_head=dict(in_channels=1152, num_classes=174), test_cfg=dict(max_testing_views=5)) @@ -36,7 +47,6 @@ op='RandAugment', magnitude=7, num_layers=4), - dict(type='Flip', flip_ratio=0.5), dict(type='RandomErasing', erase_prob=0.25, mode='rand'), dict(type='FormatShape', input_format='NCTHW'), dict(type='PackActionInputs') diff --git a/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py b/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py index 0df0b835fa..4da89b5a4a 100644 --- a/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py +++ b/configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400-rgb.py @@ -2,6 +2,19 @@ '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py' ] +model = dict( + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + blending=dict( + type='RandomBatchAugment', + augments=[ + dict(type='MixupBlending', alpha=0.8, num_classes=400), + dict(type='CutmixBlending', alpha=1, num_classes=400) + ]), + format_shape='NCTHW'), ) + # dataset 
settings dataset_type = 'VideoDataset' data_root = 'data/kinetics400/videos_train' diff --git a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py index 7327df2e11..23f404db53 100644 --- a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py @@ -28,7 +28,6 @@ op='RandAugment', magnitude=7, num_layers=4), - dict(type='Flip', flip_ratio=0.5), dict(type='RandomErasing', erase_prob=0.25, mode='rand'), dict(type='FormatShape', input_format='NCTHW'), dict(type='PackActionInputs') @@ -105,7 +104,8 @@ optim_wrapper = dict( type='AmpOptimWrapper', optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)) + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)) param_scheduler = [ dict( diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index e756410dac..d050c40f4a 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -265,80 +265,6 @@ def __repr__(self): return repr_str -@TRANSFORMS.register_module() -class SampleFramesV2(SampleFrames): - """Sample frames from the video. - - Required keys are "total_frames", "start_index" , added or modified keys - are "frame_inds", "frame_interval" and "num_clips". - Args: - clip_len (int): Frames of each sampled output clip. - frame_interval (int): Temporal interval of adjacent sampled frames. - Default: 1. - num_clips (int): Number of clips to be sampled. Default: 1. - temporal_jitter (bool): Whether to apply temporal jittering. - Default: False. - out_of_bound_opt (str): The way to deal with out of bounds frame - indexes. Available options are 'loop', 'repeat_last'. - Default: 'loop'. - test_mode (bool): Store True when building test or validation dataset. - Default: False. - start_index (None): This argument is deprecated and moved to dataset - class (``BaseDataset``, ``VideoDatset``, ``RawframeDataset``, etc), - see this: https://github.com/open-mmlab/mmaction2/pull/89. - keep_tail_frames (bool): Whether to keep tail frames when sampling. - Default: False. - """ - - def __init__(self, - clip_len, - frame_interval=1, - num_clips=1, - temporal_jitter=False, - out_of_bound_opt='loop', - test_mode=False, - keep_tail_frames=False): - super().__init__(clip_len, frame_interval, num_clips, temporal_jitter, - False, out_of_bound_opt, test_mode, keep_tail_frames) - - def _get_train_clips(self, num_frames): - """Get clip offsets in train mode. - - Args: - num_frames (int): Total number of frame in the video. - Returns: - np.ndarray: Sampled frame indices in train mode. - """ - ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 - max_offset = max(num_frames - ori_clip_len, 0) - - num_segments = max(self.num_clips - 1, 1) - offset_between = max_offset / num_segments - clip_offsets = np.arange(self.num_clips) * offset_between - clip_offsets += np.random.uniform(0, offset_between, self.num_clips) - clip_offsets = np.round(clip_offsets).astype(np.int32) - return clip_offsets - - def _get_test_clips(self, num_frames): - """Get clip offsets in test mode. - - If the total number of frames is - not enough, it will return all zero indices. - Args: - num_frames (int): Total number of frame in the video. - Returns: - np.ndarray: Sampled frame indices in test mode. 
- """ - ori_clip_len = (self.clip_len - 1) * self.frame_interval + 1 - max_offset = max(num_frames - ori_clip_len, 0) - - num_segments = max(self.num_clips - 1, 1) - offset_between = max_offset / float(num_segments) - clip_offsets = np.arange(self.num_clips) * offset_between - clip_offsets = np.round(clip_offsets).astype(np.int32) - return clip_offsets - - @TRANSFORMS.register_module() class UniformSampleFrames(BaseTransform): """Uniformly sample frames from the video. diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py index 7974767cfc..1fb6b36290 100644 --- a/mmaction/models/backbones/mvit.py +++ b/mmaction/models/backbones/mvit.py @@ -7,8 +7,10 @@ import torch.nn.functional as F from mmcv.cnn import build_activation_layer, build_norm_layer from mmcv.cnn.bricks import DropPath +from mmengine.logging import MMLogger from mmengine.model import BaseModule, ModuleList -from mmengine.model.weight_init import trunc_normal_ +from mmengine.model.weight_init import constant_init, trunc_normal_ +from mmengine.runner import load_checkpoint from mmengine.utils import to_3tuple from mmaction.registry import MODELS @@ -332,7 +334,6 @@ def init_weights(self): if not self.rel_pos_zero_init: trunc_normal_(self.rel_pos_h, std=0.02) trunc_normal_(self.rel_pos_w, std=0.02) - if not self.rel_pos_zero_init: trunc_normal_(self.rel_pos_t, std=0.02) def forward(self, x, in_size): @@ -672,10 +673,9 @@ def __init__(self, stride=(2, 4, 4), padding=(1, 3, 3)), init_cfg=None): - if pretrained: - init_cfg = dict(type='Pretrained', checkpoint=pretrained) super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained if isinstance(arch, str): arch = arch.lower() assert arch in set(self.arch_zoo), \ @@ -793,13 +793,27 @@ def __init__(self, norm_layer = build_norm_layer(norm_cfg, out_dims)[1] self.add_module(f'norm{stage_index}', norm_layer) - def init_weights(self): - super().init_weights() + def init_weights(self, pretrained: Optional[str] = None) -> None: - if (isinstance(self.init_cfg, dict) - and self.init_cfg['type'] == 'Pretrained'): - # Suppress default init if use pretrained model. - return + def _init_weights(m): + if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv3d)): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + constant_init(m.bias, 0.02) + elif isinstance(m, nn.LayerNorm): + constant_init(m.bias, 0.02) + constant_init(m.weight, 1.0) + + if pretrained: + self.pretrained = pretrained + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + logger.info(f'load model from: {self.pretrained}') + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + elif self.pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') if self.use_abs_pos_embed: trunc_normal_(self.pos_embed, std=0.02) diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index 3395f96fe3..c803fc8561 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .base import BaseHead from .i3d_head import I3DHead -from .mvit_head import MVitHead +from .mvit_head import MViTHead from .slowfast_head import SlowFastHead from .stgcn_head import STGCNHead from .timesformer_head import TimeSformerHead @@ -15,5 +15,5 @@ __all__ = [ 'TSNHead', 'I3DHead', 'BaseHead', 'TSMHead', 'SlowFastHead', 'TPNHead', 'X3DHead', 'TRNHead', 'TimeSformerHead', 'STGCNHead', 'TSNAudioHead', - 'MVitHead' + 'MViTHead' ] diff --git a/mmaction/models/heads/mvit_head.py b/mmaction/models/heads/mvit_head.py index eac4d30266..c5df34ea17 100644 --- a/mmaction/models/heads/mvit_head.py +++ b/mmaction/models/heads/mvit_head.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import List, Tuple -from mmengine.model.weight_init import trunc_normal_init +from mmengine.model.weight_init import constant_init, trunc_normal_init from torch import Tensor, nn from mmaction.registry import MODELS @@ -10,8 +10,8 @@ @MODELS.register_module() -class MVitHead(BaseHead): - """Classification head for TimeSformer. +class MViTHead(BaseHead): + """Classification head for Multi-scale ViT. Args: num_classes (int): Number of classes to be classified. @@ -42,7 +42,8 @@ def __init__(self, def init_weights(self) -> None: """Initiate the parameters from scratch.""" - trunc_normal_init(self.fc_cls, std=self.init_std) + trunc_normal_init(self.fc_cls.weight, std=self.init_std) + constant_init(self.fc_cls.bias, 0.02) def pre_logits(self, feats: Tuple[List[Tensor]]) -> Tensor: """The process before the final classification head. diff --git a/tests/models/backbones/test_mvit.py b/tests/models/backbones/test_mvit.py index 4ebdbc26db..633cf73872 100644 --- a/tests/models/backbones/test_mvit.py +++ b/tests/models/backbones/test_mvit.py @@ -92,7 +92,7 @@ def test_init_weights(self): self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.))) def test_forward(self): - imgs = torch.randn(1, 3, 16, 224, 224) + imgs = torch.randn(1, 3, 6, 64, 64) cfg = deepcopy(self.cfg) model = MViT(**cfg) @@ -100,7 +100,7 @@ def test_forward(self): self.assertIsInstance(outs, tuple) self.assertEqual(len(outs), 1) patch_token, cls_token = outs[-1] - self.assertEqual(patch_token.shape, (1, 768, 8, 7, 7)) + self.assertEqual(patch_token.shape, (1, 768, 3, 2, 2)) # Test forward with multi out scales cfg = deepcopy(self.cfg) @@ -113,13 +113,13 @@ def test_forward(self): stride = 2**stage patch_token, cls_token = out self.assertEqual(patch_token.shape, - (1, 96 * stride, 8, 56 // stride, 56 // stride)) + (1, 96 * stride, 3, 16 // stride, 16 // stride)) self.assertEqual(cls_token.shape, (1, 96 * stride)) # Test forward with dynamic input size - imgs1 = torch.randn(1, 3, 16, 224, 224) - imgs2 = torch.randn(1, 3, 16, 256, 256) - imgs3 = torch.randn(1, 3, 16, 256, 309) + imgs1 = torch.randn(1, 3, 2, 64, 64) + imgs2 = torch.randn(1, 3, 2, 96, 96) + imgs3 = torch.randn(1, 3, 2, 96, 128) cfg = deepcopy(self.cfg) model = MViT(**cfg) for imgs in [imgs1, imgs2, imgs3]: diff --git a/tests/models/heads/test_mvit_head.py b/tests/models/heads/test_mvit_head.py new file mode 100644 index 0000000000..8f64f5bf06 --- /dev/null +++ b/tests/models/heads/test_mvit_head.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +import torch.nn as nn + +from mmaction.models import MViTHead + + +class TestMViTHead(TestCase): + DEFAULT_ARGS = dict(in_channels=768, num_classes=5) + fake_feats = ([torch.rand(4, 768, 3, 2, 2), torch.rand(4, 768)], ) + + def test_init(self): + head = MViTHead(**self.DEFAULT_ARGS) + head.init_weights() + self.assertEqual(head.dropout.p, head.dropout_ratio) + self.assertIsInstance(head.fc_cls, nn.Linear) + self.assertEqual(head.num_classes, 5) + self.assertEqual(head.dropout_ratio, 0.5) + self.assertEqual(head.in_channels, 768) + self.assertEqual(head.init_std, 0.02) + + def test_pre_logits(self): + head = MViTHead(**self.DEFAULT_ARGS) + pre_logits = head.pre_logits(self.fake_feats) + self.assertIs(pre_logits, self.fake_feats[-1][1]) + + def test_forward(self): + head = MViTHead(**self.DEFAULT_ARGS) + cls_score = head(self.fake_feats) + self.assertEqual(cls_score.shape, (4, 5)) From c9304c3771d9aaedf94edca1fecc206309facd91 Mon Sep 17 00:00:00 2001 From: lilin Date: Tue, 22 Nov 2022 19:05:43 +0800 Subject: [PATCH 3/8] [doc] fix docstring --- mmaction/models/backbones/mvit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py index 1fb6b36290..f52aa35688 100644 --- a/mmaction/models/backbones/mvit.py +++ b/mmaction/models/backbones/mvit.py @@ -607,7 +607,8 @@ class MViT(BaseModule): >>> register_all_modules() >>> >>> cfg = dict(type='MViT', arch='tiny', out_scales=[0, 1, 2, 3]) - >>> model = model = MODELS.build(cfg) + >>> model = MODELS.build(cfg) + >>> model.init_weights() >>> inputs = torch.rand(1, 3, 16, 224, 224) >>> outputs = model(inputs) >>> for i, output in enumerate(outputs): From 22f6600eb63054045503542e12fa9aa691b88553 Mon Sep 17 00:00:00 2001 From: lilin Date: Wed, 23 Nov 2022 14:52:49 +0800 Subject: [PATCH 4/8] add type hint --- mmaction/datasets/transforms/loading.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index d050c40f4a..f40c193d4a 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -290,11 +290,11 @@ class UniformSampleFrames(BaseTransform): """ def __init__(self, - clip_len, - num_clips=1, - test_mode=False, - seed=255, - out_of_bound_opt='loop'): + clip_len: int, + num_clips: int = 1, + test_mode: bool = False, + seed: int = 255, + out_of_bound_opt: str = 'loop') -> None: self.clip_len = clip_len self.num_clips = num_clips @@ -303,7 +303,7 @@ def __init__(self, self.out_of_bound_opt = out_of_bound_opt assert self.out_of_bound_opt in ['loop', 'repeat_frame'] - def _get_train_clips(self, num_frames): + def _get_train_clips(self, num_frames: int): """Uniformly sample indices for training clips. Args: @@ -333,7 +333,7 @@ def _get_train_clips(self, num_frames): inds = bst + offset return inds - def _get_test_clips(self, num_frames): + def _get_test_clips(self, num_frames: int): """Uniformly sample indices for testing clips. Args: @@ -380,7 +380,7 @@ def _get_test_clips(self, num_frames): inds = np.concatenate(all_inds) return inds - def _get_repeat_sample_clips(self, num_frames): + def _get_repeat_sample_clips(self, num_frames: int) -> np.array: """Repeat sample when video is shorter than clip_len Modified from https://github.com/facebookresearch/SlowFast/blob/64ab cc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. 
@@ -409,7 +409,7 @@ def _get_repeat_sample_clips(self, num_frames): return np.array(inds) - def transform(self, results): + def transform(self, results: dict): num_frames = results['total_frames'] if self.out_of_bound_opt == 'loop': From 08de55812f24a25922baa05e77d626dc29da8eb6 Mon Sep 17 00:00:00 2001 From: lilin Date: Wed, 23 Nov 2022 16:07:14 +0800 Subject: [PATCH 5/8] add type hint --- mmaction/datasets/transforms/loading.py | 17 ++- mmaction/models/backbones/mvit.py | 170 ++++++++++++------------ mmaction/models/heads/mvit_head.py | 3 + 3 files changed, 102 insertions(+), 88 deletions(-) diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index f40c193d4a..6ec61c0590 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -270,23 +270,32 @@ class UniformSampleFrames(BaseTransform): """Uniformly sample frames from the video. To sample an n-frame clip from the video. UniformSampleFrames basically - divide the video into n segments of equal length and randomly sample one + divides the video into n segments of equal length and randomly samples one frame from each segment. To make the testing results reproducible, a random seed is set during testing, to make the sampling results deterministic. - Required keys are "total_frames", "start_index" , added or modified keys - are "frame_inds", "clip_len", "frame_interval" and "num_clips". + Required keys: + + - total_frames + - start_index + + Added keys: + + - frame_inds + - clip_len + - frame_interval + - num_clips Args: clip_len (int): Frames of each sampled output clip. num_clips (int): Number of clips to be sampled. Default: 1. test_mode (bool): Store True when building test or validation dataset. Default: False. + seed (int): The random seed used during test time. Default: 255. out_of_bound_opt (str): The way to deal with out of bounds frame indexes. Available options are 'loop', 'repeat_frame'. Default: 'loop'. - seed (int): The random seed used during test time. Default: 255. """ def __init__(self, diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py index f52aa35688..b3ce6e7427 100644 --- a/mmaction/models/backbones/mvit.py +++ b/mmaction/models/backbones/mvit.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Sequence +from typing import Dict, List, Optional, Sequence, Tuple, Union import numpy as np import torch @@ -17,11 +17,11 @@ from ..utils.embed import PatchEmbed3D -def resize_pos_embed(pos_embed, - src_shape, - dst_shape, - mode='trilinear', - num_extra_tokens=1): +def resize_pos_embed(pos_embed: torch.Tensor, + src_shape: Tuple[int], + dst_shape: Tuple[int], + mode: str = 'trilinear', + num_extra_tokens: int = 1) -> torch.Tensor: """Resize pos_embed weights. Args: @@ -63,7 +63,8 @@ def resize_pos_embed(pos_embed, return torch.cat((extra_tokens, dst_weight), dim=1) -def resize_decomposed_rel_pos(rel_pos, q_size, k_size): +def resize_decomposed_rel_pos(rel_pos: torch.Tensor, q_size: int, + k_size: int) -> torch.Tensor: """Get relative positional embeddings according to the relative positions of query and key sizes. 
@@ -100,14 +101,14 @@ def resize_decomposed_rel_pos(rel_pos, q_size, k_size): return resized[relative_coords.long()] -def add_decomposed_rel_pos(attn, - q, - q_shape, - k_shape, - rel_pos_h, - rel_pos_w, - rel_pos_t, - with_cls_token=False): +def add_decomposed_rel_pos(attn: torch.Tensor, + q: torch.Tensor, + q_shape: Sequence[int], + k_shape: Sequence[int], + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + rel_pos_t: torch.Tensor, + with_cls_token: bool = False) -> torch.Tensor: """Spatiotemporal Relative Positional Embeddings.""" sp_idx = 1 if with_cls_token else 0 B, num_heads, _, C = q.shape @@ -155,11 +156,11 @@ class MLP(BaseModule): """ def __init__(self, - in_channels, - hidden_channels=None, - out_channels=None, - act_cfg=dict(type='GELU'), - init_cfg=None): + in_channels: int, + hidden_channels: Optional[int] = None, + out_channels: Optional[int] = None, + act_cfg: Dict = dict(type='GELU'), + init_cfg: Optional[Dict] = None) -> None: super().__init__(init_cfg=init_cfg) out_channels = out_channels or in_channels hidden_channels = hidden_channels or in_channels @@ -167,7 +168,7 @@ def __init__(self, self.act = build_activation_layer(act_cfg) self.fc2 = nn.Linear(hidden_channels, out_channels) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.fc1(x) x = self.act(x) x = self.fc2(x) @@ -176,9 +177,9 @@ def forward(self, x): def attention_pool(x: torch.Tensor, pool: nn.Module, - in_size: tuple, + in_size: Tuple[int], with_cls_token: bool = False, - norm: Optional[nn.Module] = None): + norm: Optional[nn.Module] = None) -> tuple: """Pooling the feature tokens. Args: @@ -260,20 +261,20 @@ class MultiScaleAttention(BaseModule): """ def __init__(self, - in_dims, - out_dims, - num_heads, - qkv_bias=True, - norm_cfg=dict(type='LN'), - pool_kernel=(3, 3, 3), - stride_q=(1, 1, 1), - stride_kv=(1, 1, 1), - rel_pos_embed=True, - residual_pooling=True, - input_size=None, - rel_pos_zero_init=False, - with_cls_token=True, - init_cfg=None): + in_dims: int, + out_dims: int, + num_heads: int, + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN'), + pool_kernel: Tuple[int] = (3, 3, 3), + stride_q: Tuple[int] = (1, 1, 1), + stride_kv: Tuple[int] = (1, 1, 1), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + input_size: Optional[Tuple[int]] = None, + rel_pos_zero_init: bool = False, + with_cls_token: bool = True, + init_cfg: Optional[dict] = None) -> None: super().__init__(init_cfg=init_cfg) self.num_heads = num_heads self.with_cls_token = with_cls_token @@ -322,7 +323,7 @@ def build_pooling(stride): self.rel_pos_t = nn.Parameter( torch.zeros(2 * input_size[0] - 1, head_dim)) - def init_weights(self): + def init_weights(self) -> None: """Weight initialization.""" super().init_weights() @@ -336,7 +337,7 @@ def init_weights(self): trunc_normal_(self.rel_pos_w, std=0.02) trunc_normal_(self.rel_pos_t, std=0.02) - def forward(self, x, in_size): + def forward(self, x: torch.Tensor, in_size: Tuple[int]) -> tuple: """Forward the MultiScaleAttention.""" B, N, _ = x.shape # (B, H*W, C) @@ -427,25 +428,25 @@ class MultiScaleBlock(BaseModule): def __init__( self, - in_dims, - out_dims, - num_heads, - mlp_ratio=4.0, - qkv_bias=True, - drop_path=0.0, - norm_cfg=dict(type='LN'), - act_cfg=dict(type='GELU'), - qkv_pool_kernel=(3, 3, 3), - stride_q=(1, 1, 1), - stride_kv=(1, 1, 1), - rel_pos_embed=True, - residual_pooling=True, - with_cls_token=True, - dim_mul_in_attention=True, - input_size=None, - rel_pos_zero_init=False, - init_cfg=None, - ): + in_dims: 
int, + out_dims: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + drop_path: float = 0.0, + norm_cfg: Dict = dict(type='LN'), + act_cfg: Dict = dict(type='GELU'), + qkv_pool_kernel: Tuple = (3, 3, 3), + stride_q: Tuple = (1, 1, 1), + stride_kv: Tuple = (1, 1, 1), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + with_cls_token: bool = True, + dim_mul_in_attention: bool = True, + input_size: Optional[Tuple[int]] = None, + rel_pos_zero_init: bool = False, + init_cfg: Optional[Dict] = None, + ) -> None: super().__init__(init_cfg=init_cfg) self.with_cls_token = with_cls_token self.in_dims = in_dims @@ -499,7 +500,7 @@ def __init__( self.pool_skip = None self.init_out_size = input_size - def forward(self, x, in_size): + def forward(self, x: torch.Tensor, in_size: Tuple[int]) -> tuple: x_norm = self.norm1(x) x_attn, out_size = self.attn(x_norm, in_size) @@ -647,33 +648,33 @@ class MViT(BaseModule): num_extra_tokens = 1 def __init__(self, - arch='base', - spatial_size=224, - temporal_size=16, - in_channels=3, - pretrained=None, - out_scales=-1, - drop_path_rate=0., - use_abs_pos_embed=False, - interpolate_mode='trilinear', - pool_kernel=(3, 3, 3), - dim_mul=2, - head_mul=2, - adaptive_kv_stride=(1, 8, 8), - rel_pos_embed=True, - residual_pooling=True, - dim_mul_in_attention=True, - with_cls_token=True, - output_cls_token=True, - rel_pos_zero_init=False, - mlp_ratio=4., - qkv_bias=True, - norm_cfg=dict(type='LN', eps=1e-6), - patch_cfg=dict( + arch: str = 'base', + spatial_size: int = 224, + temporal_size: int = 16, + in_channels: int = 3, + pretrained: Optional[str] = None, + out_scales: Union[int, Sequence[int]] = -1, + drop_path_rate: float = 0., + use_abs_pos_embed: bool = False, + interpolate_mode: str = 'trilinear', + pool_kernel: tuple = (3, 3, 3), + dim_mul: int = 2, + head_mul: int = 2, + adaptive_kv_stride: tuple = (1, 8, 8), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + dim_mul_in_attention: bool = True, + with_cls_token: bool = True, + output_cls_token: bool = True, + rel_pos_zero_init: bool = False, + mlp_ratio: float = 4., + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN', eps=1e-6), + patch_cfg: Dict = dict( kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3)), - init_cfg=None): + init_cfg: Optional[Dict] = None) -> None: super().__init__(init_cfg=init_cfg) self.pretrained = pretrained @@ -819,7 +820,8 @@ def _init_weights(m): if self.use_abs_pos_embed: trunc_normal_(self.pos_embed, std=0.02) - def forward(self, x): + def forward(self, x: torch.Tensor) ->\ + Tuple[Union[torch.Tensor, List[torch.Tensor]]]: """Forward the MViT.""" B = x.shape[0] x, patch_resolution = self.patch_embed(x) diff --git a/mmaction/models/heads/mvit_head.py b/mmaction/models/heads/mvit_head.py index c5df34ea17..3797bb616d 100644 --- a/mmaction/models/heads/mvit_head.py +++ b/mmaction/models/heads/mvit_head.py @@ -13,6 +13,9 @@ class MViTHead(BaseHead): """Classification head for Multi-scale ViT. + A PyTorch implement of : `MViTv2: Improved Multiscale Vision Transformers + for Classification and Detection `_ + Args: num_classes (int): Number of classes to be classified. in_channels (int): Number of channels in input feature. 
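For context on how the pieces above fit together: `MViTHead` consumes the backbone output format exercised by the unit tests, i.e. a tuple with one `[patch_token, cls_token]` pair per output scale, and classifies from the cls token of the last scale. A minimal sketch, not part of the patch itself, assuming an mmaction2 environment with this patch applied; the tensor shapes mirror `test_mvit_head.py`:

```python
import torch

from mmaction.models import MViTHead

# Head matching the 768-channel features asserted in test_mvit.py.
head = MViTHead(in_channels=768, num_classes=5)
head.init_weights()

# Fake backbone output: one [patch_token, cls_token] pair per out_scale,
# with NCTHW patch tokens and a (B, C) cls token.
feats = ([torch.rand(4, 768, 3, 2, 2), torch.rand(4, 768)], )

assert head.pre_logits(feats) is feats[-1][1]  # cls token of the last scale
cls_score = head(feats)
print(cls_score.shape)  # torch.Size([4, 5])
```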
From c7dea18436671e04feeb6627035e82a318a38ad9 Mon Sep 17 00:00:00 2001 From: lilin Date: Wed, 23 Nov 2022 16:48:11 +0800 Subject: [PATCH 6/8] fix init_cfg for mvit --- mmaction/models/backbones/mvit.py | 101 ++++++++++++++---------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/mmaction/models/backbones/mvit.py b/mmaction/models/backbones/mvit.py index b3ce6e7427..95f917f136 100644 --- a/mmaction/models/backbones/mvit.py +++ b/mmaction/models/backbones/mvit.py @@ -7,10 +7,8 @@ import torch.nn.functional as F from mmcv.cnn import build_activation_layer, build_norm_layer from mmcv.cnn.bricks import DropPath -from mmengine.logging import MMLogger from mmengine.model import BaseModule, ModuleList -from mmengine.model.weight_init import constant_init, trunc_normal_ -from mmengine.runner import load_checkpoint +from mmengine.model.weight_init import trunc_normal_ from mmengine.utils import to_3tuple from mmaction.registry import MODELS @@ -160,7 +158,7 @@ def __init__(self, hidden_channels: Optional[int] = None, out_channels: Optional[int] = None, act_cfg: Dict = dict(type='GELU'), - init_cfg: Optional[Dict] = None) -> None: + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: super().__init__(init_cfg=init_cfg) out_channels = out_channels or in_channels hidden_channels = hidden_channels or in_channels @@ -598,8 +596,12 @@ class MViT(BaseModule): ``dict(kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))``. - init_cfg (dict, optional): The Config for initialization. - Defaults to None. + init_cfg (dict, optional): The Config for initialization. Defaults to + ``[ + dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02), + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.02), + ]`` Examples: >>> import torch @@ -647,37 +649,42 @@ class MViT(BaseModule): } num_extra_tokens = 1 - def __init__(self, - arch: str = 'base', - spatial_size: int = 224, - temporal_size: int = 16, - in_channels: int = 3, - pretrained: Optional[str] = None, - out_scales: Union[int, Sequence[int]] = -1, - drop_path_rate: float = 0., - use_abs_pos_embed: bool = False, - interpolate_mode: str = 'trilinear', - pool_kernel: tuple = (3, 3, 3), - dim_mul: int = 2, - head_mul: int = 2, - adaptive_kv_stride: tuple = (1, 8, 8), - rel_pos_embed: bool = True, - residual_pooling: bool = True, - dim_mul_in_attention: bool = True, - with_cls_token: bool = True, - output_cls_token: bool = True, - rel_pos_zero_init: bool = False, - mlp_ratio: float = 4., - qkv_bias: bool = True, - norm_cfg: Dict = dict(type='LN', eps=1e-6), - patch_cfg: Dict = dict( - kernel_size=(3, 7, 7), - stride=(2, 4, 4), - padding=(1, 3, 3)), - init_cfg: Optional[Dict] = None) -> None: + def __init__( + self, + arch: str = 'base', + spatial_size: int = 224, + temporal_size: int = 16, + in_channels: int = 3, + pretrained: Optional[str] = None, + out_scales: Union[int, Sequence[int]] = -1, + drop_path_rate: float = 0., + use_abs_pos_embed: bool = False, + interpolate_mode: str = 'trilinear', + pool_kernel: tuple = (3, 3, 3), + dim_mul: int = 2, + head_mul: int = 2, + adaptive_kv_stride: tuple = (1, 8, 8), + rel_pos_embed: bool = True, + residual_pooling: bool = True, + dim_mul_in_attention: bool = True, + with_cls_token: bool = True, + output_cls_token: bool = True, + rel_pos_zero_init: bool = False, + mlp_ratio: float = 4., + qkv_bias: bool = True, + norm_cfg: Dict = dict(type='LN', eps=1e-6), + patch_cfg: Dict = dict( + kernel_size=(3, 7, 7), 
stride=(2, 4, 4), padding=(1, 3, 3)), + init_cfg: Optional[Union[Dict, List[Dict]]] = [ + dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02), + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.02), + ] + ) -> None: + if pretrained: + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) super().__init__(init_cfg=init_cfg) - self.pretrained = pretrained if isinstance(arch, str): arch = arch.lower() assert arch in set(self.arch_zoo), \ @@ -796,26 +803,12 @@ def __init__(self, self.add_module(f'norm{stage_index}', norm_layer) def init_weights(self, pretrained: Optional[str] = None) -> None: + super().init_weights() - def _init_weights(m): - if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv3d)): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - constant_init(m.bias, 0.02) - elif isinstance(m, nn.LayerNorm): - constant_init(m.bias, 0.02) - constant_init(m.weight, 1.0) - - if pretrained: - self.pretrained = pretrained - if isinstance(self.pretrained, str): - logger = MMLogger.get_current_instance() - logger.info(f'load model from: {self.pretrained}') - load_checkpoint(self, self.pretrained, strict=False, logger=logger) - elif self.pretrained is None: - self.apply(_init_weights) - else: - raise TypeError('pretrained must be a str or None') + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress default init if use pretrained model. + return if self.use_abs_pos_embed: trunc_normal_(self.pos_embed, std=0.02) From 8f123b4d3124436e3e862d19d4a3796f480f6963 Mon Sep 17 00:00:00 2001 From: lilin Date: Wed, 23 Nov 2022 20:00:54 +0800 Subject: [PATCH 7/8] fix ut --- mmaction/models/recognizers/recognizer3d.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mmaction/models/recognizers/recognizer3d.py b/mmaction/models/recognizers/recognizer3d.py index bb7e250157..81b86534ac 100644 --- a/mmaction/models/recognizers/recognizer3d.py +++ b/mmaction/models/recognizers/recognizer3d.py @@ -83,7 +83,10 @@ def recursively_cat(feats): return tuple(out_feats) - x = recursively_cat(feats) + if isinstance(feats[0], tuple): + x = recursively_cat(feats) + else: + x = torch.cat(feats) else: x = self.backbone(inputs) if self.with_neck: From 6f78baceaab0d0b3eb45a650987a92b575b2e461 Mon Sep 17 00:00:00 2001 From: lilin Date: Tue, 29 Nov 2022 15:22:49 +0800 Subject: [PATCH 8/8] split uniform sample --- .../mvit/mvit-base-p244_u32_sthv2-rgb.py | 17 +-- .../mvit/mvit-large-p244_u40_sthv2-rgb.py | 17 +-- .../mvit/mvit-small-p244_u16_sthv2-rgb.py | 17 +-- mmaction/datasets/transforms/__init__.py | 14 +- mmaction/datasets/transforms/loading.py | 130 ++--------------- mmaction/datasets/transforms/pose_loading.py | 135 ++++++++++++++++++ .../datasets/transforms/test_pose_loading.py | 88 +++++++++++- tests/datasets/transforms/test_sampling.py | 87 +---------- 8 files changed, 254 insertions(+), 251 deletions(-) diff --git a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py index 944e17440d..c954b60b54 100644 --- a/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-base-p244_u32_sthv2-rgb.py @@ -32,10 +32,7 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=32, - out_of_bound_opt='repeat_frame'), + dict(type='UniformSample', 
clip_len=32), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -51,11 +48,7 @@ ] val_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=32, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=32, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), @@ -64,11 +57,7 @@ ] test_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=32, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=32, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 224)), dict(type='ThreeCrop', crop_size=224), diff --git a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py index 9b47b27a10..b3fde41a78 100644 --- a/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-large-p244_u40_sthv2-rgb.py @@ -34,10 +34,7 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=40, - out_of_bound_opt='repeat_frame'), + dict(type='UniformSample', clip_len=40), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -53,11 +50,7 @@ ] val_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=40, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=40, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), @@ -66,11 +59,7 @@ ] test_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=40, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=40, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 224)), dict(type='ThreeCrop', crop_size=224), diff --git a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py index 23f404db53..08934b9a5e 100644 --- a/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py +++ b/configs/recognition/mvit/mvit-small-p244_u16_sthv2-rgb.py @@ -15,10 +15,7 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=16, - out_of_bound_opt='repeat_frame'), + dict(type='UniformSample', clip_len=16), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -34,11 +31,7 @@ ] val_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=16, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=16, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), @@ -47,11 +40,7 @@ ] test_pipeline = [ dict(type='DecordInit', **file_client_args), - dict( - type='UniformSampleFrames', - clip_len=16, - out_of_bound_opt='repeat_frame', - test_mode=True), + dict(type='UniformSample', clip_len=16, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 224)), dict(type='ThreeCrop', crop_size=224), diff --git 
a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py index 7aaaee894d..09e0111e4c 100644 --- a/mmaction/datasets/transforms/__init__.py +++ b/mmaction/datasets/transforms/__init__.py @@ -10,9 +10,9 @@ LoadProposals, OpenCVDecode, OpenCVInit, PIMSDecode, PIMSInit, PyAVDecode, PyAVDecodeMotionVector, PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, - UniformSampleFrames, UntrimmedSampleFrames) + UniformSample, UntrimmedSampleFrames) from .pose_loading import (GeneratePoseTarget, LoadKineticsPose, - PaddingWithLoop, PoseDecode) + PaddingWithLoop, PoseDecode, UniformSampleFrames) from .processing import (AudioAmplify, CenterCrop, ColorJitter, Flip, Fuse, MelSpectrogram, MultiScaleCrop, PoseCompact, RandomCrop, RandomRescale, RandomResizedCrop, Resize, @@ -30,9 +30,9 @@ 'AudioAmplify', 'MelSpectrogram', 'AudioDecode', 'FormatAudioShape', 'LoadAudioFeature', 'AudioFeatureSelector', 'AudioDecodeInit', 'ImageDecode', 'BuildPseudoClip', 'RandomRescale', 'PIMSDecode', - 'PyAVDecodeMotionVector', 'UniformSampleFrames', 'PoseDecode', - 'LoadKineticsPose', 'GeneratePoseTarget', 'PIMSInit', 'FormatGCNInput', - 'PaddingWithLoop', 'ArrayDecode', 'JointToBone', 'PackActionInputs', - 'PackLocalizationInputs', 'ImgAug', 'TorchVisionWrapper', - 'PytorchVideoWrapper', 'PoseCompact' + 'PyAVDecodeMotionVector', 'UniformSample', 'UniformSampleFrames', + 'PoseDecode', 'LoadKineticsPose', 'GeneratePoseTarget', 'PIMSInit', + 'FormatGCNInput', 'PaddingWithLoop', 'ArrayDecode', 'JointToBone', + 'PackActionInputs', 'PackLocalizationInputs', 'ImgAug', + 'TorchVisionWrapper', 'PytorchVideoWrapper', 'PoseCompact' ] diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index 6ec61c0590..e0b5ce75a6 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -266,14 +266,15 @@ def __repr__(self): @TRANSFORMS.register_module() -class UniformSampleFrames(BaseTransform): - """Uniformly sample frames from the video. +class UniformSample(BaseTransform): + """Uniformly sample frames from the video. Currently used for Something- + Something V2 dataset. Modified from + https://github.com/facebookresearch/SlowFast/blob/64a + bcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. To sample an n-frame clip from the video. UniformSampleFrames basically divides the video into n segments of equal length and randomly samples one - frame from each segment. To make the testing results reproducible, a - random seed is set during testing, to make the sampling results - deterministic. + frame from each segment. Required keys: @@ -292,113 +293,23 @@ class UniformSampleFrames(BaseTransform): num_clips (int): Number of clips to be sampled. Default: 1. test_mode (bool): Store True when building test or validation dataset. Default: False. - seed (int): The random seed used during test time. Default: 255. - out_of_bound_opt (str): The way to deal with out of bounds frame - indexes. Available options are 'loop', 'repeat_frame'. - Default: 'loop'. 
""" def __init__(self, clip_len: int, num_clips: int = 1, - test_mode: bool = False, - seed: int = 255, - out_of_bound_opt: str = 'loop') -> None: + test_mode: bool = False) -> None: self.clip_len = clip_len self.num_clips = num_clips self.test_mode = test_mode - self.seed = seed - self.out_of_bound_opt = out_of_bound_opt - assert self.out_of_bound_opt in ['loop', 'repeat_frame'] - - def _get_train_clips(self, num_frames: int): - """Uniformly sample indices for training clips. - - Args: - num_frames (int): The number of frames. - """ - - assert self.num_clips == 1 - if num_frames < self.clip_len: - start = np.random.randint(0, num_frames) - inds = np.arange(start, start + self.clip_len) - elif self.clip_len <= num_frames < 2 * self.clip_len: - basic = np.arange(self.clip_len) - inds = np.random.choice( - self.clip_len + 1, num_frames - self.clip_len, replace=False) - offset = np.zeros(self.clip_len + 1, dtype=np.int32) - offset[inds] = 1 - offset = np.cumsum(offset) - inds = basic + offset[:-1] - else: - bids = np.array([ - i * num_frames // self.clip_len - for i in range(self.clip_len + 1) - ]) - bsize = np.diff(bids) - bst = bids[:self.clip_len] - offset = np.random.randint(bsize) - inds = bst + offset - return inds - - def _get_test_clips(self, num_frames: int): - """Uniformly sample indices for testing clips. - Args: - num_frames (int): The number of frames. - """ - - np.random.seed(self.seed) - if num_frames < self.clip_len: - # Then we use a simple strategy - if num_frames < self.num_clips: - start_inds = list(range(self.num_clips)) - else: - start_inds = [ - i * num_frames // self.num_clips - for i in range(self.num_clips) - ] - inds = np.concatenate( - [np.arange(i, i + self.clip_len) for i in start_inds]) - elif self.clip_len <= num_frames < self.clip_len * 2: - all_inds = [] - for i in range(self.num_clips): - basic = np.arange(self.clip_len) - inds = np.random.choice( - self.clip_len + 1, - num_frames - self.clip_len, - replace=False) - offset = np.zeros(self.clip_len + 1, dtype=np.int32) - offset[inds] = 1 - offset = np.cumsum(offset) - inds = basic + offset[:-1] - all_inds.append(inds) - inds = np.concatenate(all_inds) - else: - bids = np.array([ - i * num_frames // self.clip_len - for i in range(self.clip_len + 1) - ]) - bsize = np.diff(bids) - bst = bids[:self.clip_len] - all_inds = [] - for i in range(self.num_clips): - offset = np.random.randint(bsize) - all_inds.append(bst + offset) - inds = np.concatenate(all_inds) - return inds - - def _get_repeat_sample_clips(self, num_frames: int) -> np.array: - """Repeat sample when video is shorter than clip_len Modified from - https://github.com/facebookresearch/SlowFast/blob/64ab - cc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159. - - When video frames is shorter than target clip len, this strategy would - repeat sample frame, rather than loop sample in 'loop' mode. - In test mode, this strategy would sample the middle frame of each - segment, rather than set a random seed, and therefore only support - sample 1 clip. + def _get_sample_clips(self, num_frames: int) -> np.array: + """When video frames is shorter than target clip len, this strategy + would repeat sample frame, rather than loop sample in 'loop' mode. In + test mode, this strategy would sample the middle frame of each segment, + rather than set a random seed, and therefore only support sample 1 + clip. Args: num_frames (int): Total number of frame in the video. 
@@ -421,17 +332,7 @@ def _get_repeat_sample_clips(self, num_frames: int) -> np.array: def transform(self, results: dict): num_frames = results['total_frames'] - if self.out_of_bound_opt == 'loop': - if self.test_mode: - inds = self._get_test_clips(num_frames) - else: - inds = self._get_train_clips(num_frames) - inds = np.mod(inds, num_frames) - elif self.out_of_bound_opt == 'repeat_frame': - inds = self._get_repeat_sample_clips(num_frames) - else: - raise ValueError('Illegal out_of_bound option.') - + inds = self._get_sample_clips(num_frames) start_index = results['start_index'] inds = inds + start_index @@ -445,8 +346,7 @@ def __repr__(self): repr_str = (f'{self.__class__.__name__}(' f'clip_len={self.clip_len}, ' f'num_clips={self.num_clips}, ' - f'test_mode={self.test_mode}, ' - f'seed={self.seed})') + f'test_mode={self.test_mode}') return repr_str diff --git a/mmaction/datasets/transforms/pose_loading.py b/mmaction/datasets/transforms/pose_loading.py index 592850334f..58748eacb6 100644 --- a/mmaction/datasets/transforms/pose_loading.py +++ b/mmaction/datasets/transforms/pose_loading.py @@ -11,6 +11,141 @@ from .processing import Flip +@TRANSFORMS.register_module() +class UniformSampleFrames(BaseTransform): + """Uniformly sample frames from the video. + + To sample an n-frame clip from the video. UniformSampleFrames basically + divide the video into n segments of equal length and randomly sample one + frame from each segment. To make the testing results reproducible, a + random seed is set during testing, to make the sampling results + deterministic. + + Required keys are ``'total_frames'``, ``'start_index'`` , added or + modified keys are ``'frame_inds'``, ``'clip_len'``, + ``'frame_interval'`` and ``'num_clips'``. + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Defaults to 1. + test_mode (bool): Store True when building test or validation dataset. + Defaults to False. + seed (int): The random seed used during test time. Defaults to 255. + """ + + def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255): + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + self.seed = seed + + def _get_train_clips(self, num_frames, clip_len): + """Uniformly sample indices for training clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + """ + + assert self.num_clips == 1 + if num_frames < clip_len: + start = np.random.randint(0, num_frames) + inds = np.arange(start, start + clip_len) + elif clip_len <= num_frames < 2 * clip_len: + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + return inds + + def _get_test_clips(self, num_frames, clip_len): + """Uniformly sample indices for testing clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. 
+ """ + + np.random.seed(self.seed) + if num_frames < clip_len: + # Then we use a simple strategy + if num_frames < self.num_clips: + start_inds = list(range(self.num_clips)) + else: + start_inds = [ + i * num_frames // self.num_clips + for i in range(self.num_clips) + ] + inds = np.concatenate( + [np.arange(i, i + clip_len) for i in start_inds]) + elif clip_len <= num_frames < clip_len * 2: + all_inds = [] + for i in range(self.num_clips): + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int32) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + all_inds.append(inds) + inds = np.concatenate(all_inds) + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + all_inds = [] + for i in range(self.num_clips): + offset = np.random.randint(bsize) + all_inds.append(bst + offset) + inds = np.concatenate(all_inds) + return inds + + def transform(self, results): + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + num_frames = results['total_frames'] + + if self.test_mode: + inds = self._get_test_clips(num_frames, self.clip_len) + else: + inds = self._get_train_clips(num_frames, self.clip_len) + + inds = np.mod(inds, num_frames) + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int32) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}, ' + f'seed={self.seed})') + return repr_str + + @TRANSFORMS.register_module() class PoseDecode(BaseTransform): """Load and decode pose with given indices. 
diff --git a/tests/datasets/transforms/test_pose_loading.py b/tests/datasets/transforms/test_pose_loading.py
index fd7568798f..eeb2dad84c 100644
--- a/tests/datasets/transforms/test_pose_loading.py
+++ b/tests/datasets/transforms/test_pose_loading.py
@@ -10,11 +10,97 @@
 from numpy.testing import assert_array_almost_equal, assert_array_equal
 
 from mmaction.datasets.transforms import (GeneratePoseTarget, LoadKineticsPose,
-                                          PaddingWithLoop, PoseDecode)
+                                          PaddingWithLoop, PoseDecode,
+                                          UniformSampleFrames)
 
 
 class TestPoseLoading:
 
+    @staticmethod
+    def test_uniform_sample_frames():
+        results = dict(total_frames=64, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=True, seed=0)
+
+        assert str(sampling) == ('UniformSampleFrames(clip_len=8, '
+                                 'num_clips=1, test_mode=True, seed=0)')
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert_array_equal(sampling_results['frame_inds'],
+                           np.array([4, 15, 21, 24, 35, 43, 51, 63]))
+
+        results = dict(total_frames=15, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=True, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert_array_equal(sampling_results['frame_inds'],
+                           np.array([0, 2, 4, 6, 8, 9, 11, 13]))
+
+        results = dict(total_frames=7, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=True, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert_array_equal(sampling_results['frame_inds'],
+                           np.array([0, 1, 2, 3, 4, 5, 6, 0]))
+
+        results = dict(total_frames=7, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=8, test_mode=True, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 8
+        assert len(sampling_results['frame_inds']) == 64
+
+        results = dict(total_frames=64, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=4, test_mode=True, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 4
+        assert_array_equal(
+            sampling_results['frame_inds'],
+            np.array([
+                4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56,
+                0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60
+            ]))
+
+        results = dict(total_frames=64, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=False, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert len(sampling_results['frame_inds']) == 8
+
+        results = dict(total_frames=7, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=False, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert len(sampling_results['frame_inds']) == 8
+
+        results = dict(total_frames=15, start_index=0)
+        sampling = UniformSampleFrames(
+            clip_len=8, num_clips=1, test_mode=False, seed=0)
+        sampling_results = sampling(results)
+        assert sampling_results['clip_len'] == 8
+        assert sampling_results['frame_interval'] is None
+        assert sampling_results['num_clips'] == 1
+        assert len(sampling_results['frame_inds']) == 8
+
     @staticmethod
     def test_pose_decode():
         kp = np.random.random([1, 16, 17, 2])
diff --git a/tests/datasets/transforms/test_sampling.py b/tests/datasets/transforms/test_sampling.py
index 9450682315..f4a5e457bd 100644
--- a/tests/datasets/transforms/test_sampling.py
+++ b/tests/datasets/transforms/test_sampling.py
@@ -9,8 +9,7 @@
 
 from mmaction.datasets.transforms import (AudioFeatureSelector,
                                           DenseSampleFrames, SampleAVAFrames,
-                                          SampleFrames, UniformSampleFrames,
-                                          UntrimmedSampleFrames)
+                                          SampleFrames, UntrimmedSampleFrames)
 
 
 class BaseTestLoading:
@@ -402,90 +401,6 @@ def check_monotonous(arr):
         assert np.max(sample_frames_results['frame_inds']) <= 40
         assert np.min(sample_frames_results['frame_inds']) >= 1
 
-    def test_uniform_sample_frames(self):
-        results = dict(total_frames=64, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=True, seed=0)
-
-        assert str(sampling) == ('UniformSampleFrames(clip_len=8, '
-                                 'num_clips=1, test_mode=True, seed=0)')
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert_array_equal(sampling_results['frame_inds'],
-                           np.array([4, 15, 21, 24, 35, 43, 51, 63]))
-
-        results = dict(total_frames=15, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=True, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert_array_equal(sampling_results['frame_inds'],
-                           np.array([0, 2, 4, 6, 8, 9, 11, 13]))
-
-        results = dict(total_frames=7, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=True, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert_array_equal(sampling_results['frame_inds'],
-                           np.array([0, 1, 2, 3, 4, 5, 6, 0]))
-
-        results = dict(total_frames=7, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=8, test_mode=True, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 8
-        assert len(sampling_results['frame_inds']) == 64
-
-        results = dict(total_frames=64, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=4, test_mode=True, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 4
-        assert_array_equal(
-            sampling_results['frame_inds'],
-            np.array([
-                4, 15, 21, 24, 35, 43, 51, 63, 1, 11, 21, 26, 36, 47, 54, 56,
-                0, 12, 18, 25, 38, 47, 55, 62, 0, 9, 21, 25, 37, 40, 49, 60
-            ]))
-
-        results = dict(total_frames=64, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=False, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert len(sampling_results['frame_inds']) == 8
-
-        results = dict(total_frames=7, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=False, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert len(sampling_results['frame_inds']) == 8
-
-        results = dict(total_frames=15, start_index=0)
-        sampling = UniformSampleFrames(
-            clip_len=8, num_clips=1, test_mode=False, seed=0)
-        sampling_results = sampling(results)
-        assert sampling_results['clip_len'] == 8
-        assert sampling_results['frame_interval'] is None
-        assert sampling_results['num_clips'] == 1
-        assert len(sampling_results['frame_inds']) == 8
-
     def test_dense_sample_frames(self):
         target_keys = [
             'frame_inds', 'clip_len', 'frame_interval', 'num_clips',