From ece0dea773f349e124a58173ed74f367569b97a3 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Tue, 6 Dec 2022 16:40:45 +0800 Subject: [PATCH 01/19] update c2d --- configs/recognition/c2d/README.md | 16 ++++++++-------- configs/recognition/c2d/metafile.yml | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/configs/recognition/c2d/README.md b/configs/recognition/c2d/README.md index 22bfd6ed86..af905ef0c4 100644 --- a/configs/recognition/c2d/README.md +++ b/configs/recognition/c2d/README.md @@ -21,18 +21,18 @@ Both convolutional and recurrent operations are building blocks that process one ### Kinetics-400 -| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | gpu_mem(M) | FLOPs | params | config | ckpt | log | -| :---------------------: | :-------: | :------------: | :--: | :-----------: | :------: | :------: | :------: | :---------------------: | :---------------------: | :----------------: | :--------: | :---: | :----: | :---------: | :-------: | :------: | -| 8x8x1 | MultiStep | short-side 320 | 8 | ResNet50
| ImageNet | 73.16 | 90.88 | 67.2
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 87.8
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 10 clips x 3 crops | 21547 | 33G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | -| 8x8x1 | MultiStep | short-side 320 | 8 | ResNet101
| ImageNet | 74.57 | 91.60 | x | x | 10 clips x 3 crops | 31836 | 63G | 43.3M | [config](/configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-557bd8bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | -| 8x8x1 | MultiStep | short-side 320 | 8 | ResNet50
(TemporalPool) | ImageNet | 73.57 | 90.96 | 71.9
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 90.0
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 10 clips x 3 crops | 17006 | 19G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb_20221027-3ca304fa.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.log) | -| 16x4x1 | MultiStep | short-side 320 | 8 | ResNet50
(TemporalPool) | ImageNet | 74.54 | 91.76 | x | x | 10 clips x 3 crops | 33630 | 39G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb_20221027-5f382a43.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.log) | +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | gpu_mem(M) | FLOPs | params | config | ckpt | log | +| :---------------------: | :-------: | :--------: | :--: | :-----------: | :------: | :------: | :------: | :----------------------: | :----------------------: | :---------------: | :--------: | :---: | :----: | :----------: | :--------: | :-------: | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet50
| ImageNet | 73.44 | 91.00 | 67.2
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 87.8
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 10 clips x 3 crop | 21547 | 33G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet101
| ImageNet | 74.97 | 91.77 | x | x | 10 clips x 3 crop | 31836 | 63G | 43.3M | [config](/configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-557bd8bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet50
(TemporalPool) | ImageNet | 73.89 | 91.21 | 71.9
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 90.0
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 10 clips x 3 crop | 17006 | 19G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb_20221027-3ca304fa.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 16x4x1 | MultiStep | 224x224 | 8 | ResNet50
(TemporalPool) | ImageNet | 74.97 | 91.91 | x | x | 10 clips x 3 crop | 33630 | 39G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb_20221027-5f382a43.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.log) | 1. The values in columns named after "reference" are the results reported in the original repo, using the same model settings. 2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. -For more details on data preparation, you can refer to [preparing_kinetics](/tools/data/kinetics/README.md). +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). ## Train @@ -46,7 +46,7 @@ Example: train C2D model on Kinetics-400 dataset in a deterministic option with ```shell python tools/train.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True + --seed 0 --deterministic ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
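As a hedged sketch of the `--auto-scale-lr` note above (assuming the standard OpenMMLab `tools/dist_train.sh` launcher; the config path is taken from the table, and the 4-GPU count is only an illustration), scaling the training down from the default 8 GPUs might look like:

```shell
# Train the C2D ResNet50 config on 4 GPUs instead of the default 8;
# --auto-scale-lr rescales the learning rate from the 8 GPUs x 32
# samples-per-GPU base batch size encoded in the config name (8xb32).
bash tools/dist_train.sh \
    configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py 4 \
    --auto-scale-lr
```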
diff --git a/configs/recognition/c2d/metafile.yml b/configs/recognition/c2d/metafile.yml index b629e5d55d..8d20761f8c 100644 --- a/configs/recognition/c2d/metafile.yml +++ b/configs/recognition/c2d/metafile.yml @@ -16,7 +16,7 @@ Models: FLOPs: 33G Parameters: 24.3M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -24,8 +24,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 73.16 - Top 5 Accuracy: 90.88 + Top 1 Accuracy: 73.44 + Top 5 Accuracy: 91.00 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth @@ -39,7 +39,7 @@ Models: FLOPs: 63G Parameters: 43.3M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -47,8 +47,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 74.57 - Top 5 Accuracy: 91.60 + Top 1 Accuracy: 74.97 + Top 5 Accuracy: 91.77 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-557bd8bc.pth @@ -62,7 +62,7 @@ Models: FLOPs: 19G Parameters: 24.3M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -70,8 +70,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 73.57 - Top 5 Accuracy: 90.96 + Top 1 Accuracy: 73.89 + Top 5 Accuracy: 91.21 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb_20221027-3ca304fa.pth @@ -85,7 +85,7 @@ Models: FLOPs: 39G Parameters: 24.3M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -93,7 +93,7 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 74.54 - Top 5 Accuracy: 91.76 + Top 1 Accuracy: 74.97 + Top 5 Accuracy: 91.91 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb_20221027-5f382a43.pth From 664369a2c8c47ec40454124d8f66071eaa8dc969 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Tue, 6 Dec 2022 19:01:36 +0800 Subject: [PATCH 02/19] update c3d --- configs/recognition/c2d/README.md | 12 ++++++------ configs/recognition/c3d/README.md | 14 ++++++-------- 
...-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py | 19 ++++++++++++++++--- configs/recognition/c3d/metafile.yml | 9 +++++---- 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/configs/recognition/c2d/README.md b/configs/recognition/c2d/README.md index af905ef0c4..651193dad2 100644 --- a/configs/recognition/c2d/README.md +++ b/configs/recognition/c2d/README.md @@ -21,12 +21,12 @@ Both convolutional and recurrent operations are building blocks that process one ### Kinetics-400 -| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | gpu_mem(M) | FLOPs | params | config | ckpt | log | -| :---------------------: | :-------: | :--------: | :--: | :-----------: | :------: | :------: | :------: | :----------------------: | :----------------------: | :---------------: | :--------: | :---: | :----: | :----------: | :--------: | :-------: | -| 8x8x1 | MultiStep | 224x224 | 8 | ResNet50
| ImageNet | 73.44 | 91.00 | 67.2
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 87.8
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 10 clips x 3 crop | 21547 | 33G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | -| 8x8x1 | MultiStep | 224x224 | 8 | ResNet101
| ImageNet | 74.97 | 91.77 | x | x | 10 clips x 3 crop | 31836 | 63G | 43.3M | [config](/configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-557bd8bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | -| 8x8x1 | MultiStep | 224x224 | 8 | ResNet50
(TemporalPool) | ImageNet | 73.89 | 91.21 | 71.9
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 90.0
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 10 clips x 3 crop | 17006 | 19G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb_20221027-3ca304fa.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.log) | -| 16x4x1 | MultiStep | 224x224 | 8 | ResNet50
(TemporalPool) | ImageNet | 74.97 | 91.91 | x | x | 10 clips x 3 crop | 33630 | 39G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb_20221027-5f382a43.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.log) | +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :-------: | :--------: | :--: | :-------------: | :------: | :------: | :------: | :-----------------------: | :-----------------------: | :---------------: | :---: | :----: | :------------: | :----------: | :---------: | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet50
| ImageNet | 73.44 | 91.00 | 67.2
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 87.8
[\[PySlowFast\]](https://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.md#kinetics-400-and-600) | 10 clips x 3 crop | 33G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet101
| ImageNet | 74.97 | 91.77 | x | x | 10 clips x 3 crop | 63G | 43.3M | [config](/configs/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-557bd8bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r101-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 8x8x1 | MultiStep | 224x224 | 8 | ResNet50
(TemporalPool) | ImageNet | 73.89 | 91.21 | 71.9
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 90.0
[\[Non-Local\]](https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed) | 10 clips x 3 crop | 19G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb_20221027-3ca304fa.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.log) | +| 16x4x1 | MultiStep | 224x224 | 8 | ResNet50
(TemporalPool) | ImageNet | 74.97 | 91.91 | x | x | 10 clips x 3 crop | 39G | 24.3M | [config](/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb_20221027-5f382a43.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.log) | 1. The values in columns named after "reference" are the results reported in the original repo, using the same model settings. 2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. diff --git a/configs/recognition/c3d/README.md b/configs/recognition/c3d/README.md index fb5f4036a4..27a84af87c 100644 --- a/configs/recognition/c3d/README.md +++ b/configs/recognition/c3d/README.md @@ -20,16 +20,14 @@ We propose a simple, yet effective approach for spatiotemporal feature learning ### UCF-101 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference_time(video/s) | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :---------------------: | :--------: | :--------------------------: | :-------------------------: | :------------------------: | -| 16x1x1 | raw | 8 | c3d | sports1m | 82.92 | 96.11 | 10 clips x 1 crop | x | 6067 | [config](/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :----------------------------------: | :--------------------------------: | :-------------------------------: | +| 16x1x1 | 112x112 | 8 | c3d | sports1m | 83.08 | 95.93 | 10 clips x 1 crop | 385G | 78.4M | [config](/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.log) | 1. The author of C3D normalized UCF-101 with volume mean and used SVM to classify videos, while we normalized the dataset with RGB mean value and used a linear classifier. -2. The **gpus** indicates the number of gpu (80G A100) we used to get the checkpoint. 
It is noteworthy that the configs we provide are used for 8 gpus as default. - According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, - e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. +2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. -For more details on data preparation, you can refer to the **Prepare videos** part in the [Data Preparation Tutorial](/docs/en/user_guides/2_data_prepare.md). +For more details on data preparation, you can refer to [UCF101](/tools/data/ucf101/README.md). ## Train @@ -43,7 +41,7 @@ Example: train C3D model on UCF-101 dataset in a deterministic option with perio ```shell python tools/train.py configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True + --seed=0 --deterministic ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). diff --git a/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py b/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py index b87df85c01..d89534ae9a 100644 --- a/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py +++ b/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py @@ -11,8 +11,15 @@ ann_file_train = f'data/ucf101/ucf101_train_split_{split}_videos.txt' ann_file_val = f'data/ucf101/ucf101_val_split_{split}_videos.txt' ann_file_test = f'data/ucf101/ucf101_val_split_{split}_videos.txt' + +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/ucf101': 's3://openmmlab/datasets/action/ucf101'})) +file_client_args = dict(io_backend='disk') + train_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=16, frame_interval=1, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 128)), @@ -22,7 +29,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=16, @@ -36,7 +43,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=16, @@ -106,3 +113,9 @@ clip_grad=dict(max_norm=40, norm_type=2)) default_hooks = dict(checkpoint=dict(interval=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (30 samples per GPU). 
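+# Illustrative note (not part of the original change): following the
+# linear scaling rule, training at 4 GPUs x 30 samples per GPU gives an
+# actual batch size of 120, so enabling this (or passing --auto-scale-lr
+# to tools/train.py) would multiply the base LR by 120 / 240 = 0.5.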
+auto_scale_lr = dict(enable=False, base_batch_size=240) diff --git a/configs/recognition/c3d/metafile.yml b/configs/recognition/c3d/metafile.yml index 26becfc8a2..c25bed15bc 100644 --- a/configs/recognition/c3d/metafile.yml +++ b/configs/recognition/c3d/metafile.yml @@ -13,9 +13,10 @@ Models: Architecture: c3d Batch Size: 30 Epochs: 45 - FLOPs: 38615475200 - Parameters: 78409573 + FLOPs: 385G + Parameters: 78.4M Pretrained: sports1m + Resolution: 112x112 Training Data: UCF101 Training Resources: 8 GPUs Modality: RGB @@ -23,7 +24,7 @@ Models: - Dataset: UCF101 Task: Action Recognition Metrics: - Top 1 Accuracy: 82.92 - Top 5 Accuracy: 96.11 + Top 1 Accuracy: 83.08 + Top 5 Accuracy: 95.93 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth From 50fda130438632e2b25cd14956b0f8a56150963f Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Wed, 7 Dec 2022 23:05:09 +0800 Subject: [PATCH 03/19] update i3d --- configs/recognition/i3d/README.md | 28 ++++---- ...roduct_8xb8-32x2x1-100e_kinetics400-rgb.py | 18 ++++- ...ed-r50_8xb8-32x2x1-100e_kinetics400-rgb.py | 18 ++++- ..._8xb8-dense-32x2x1-100e_kinetics400-rgb.py | 11 +++- configs/recognition/i3d/metafile.yml | 66 ++++++++++--------- 5 files changed, 84 insertions(+), 57 deletions(-) diff --git a/configs/recognition/i3d/README.md b/configs/recognition/i3d/README.md index 66703707c4..094084f196 100644 --- a/configs/recognition/i3d/README.md +++ b/configs/recognition/i3d/README.md @@ -22,23 +22,19 @@ The paucity of videos in current action classification datasets (UCF-101 and HMD ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------------------: | :------: | :------: | :------: | :---------------: | :---------------------: | :--------: | :--------------------: | :------------------: | :-----------------: | -| 32x2x1 | short-side 320 | 8 | ResNet50 (NonLocalDotProduct) | ImageNet | 74.76 | 91.84 | 10 clips x 3 crop | x | 6245 | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.log) | -| 32x2x1 | short-side 320 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 74.69 | 91.69 | 10 clips x 3 crop | x | 6415 | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/ii3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-afd8f562.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | -| 32x2x1 | short-side 320 | 8 | ResNet50 (NonLocalGauss) | ImageNet | 73.90 | 91.15 | 10 clips x 3 crop | x | 6108 | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-0c5cbf5a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | -| 32x2x1 | short-side 320 | 8 | ResNet50 | ImageNet | 73.22 | 91.11 | 10 clips x 3 crop | x | 5149 | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.log) | -| dense-32x2x1 | short-side 320 | 8 | ResNet50 | ImageNet | 73.77 | 91.35 | 10 clips x 3 crop | x | 5151 | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb_20220812-9f46003f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.log) | -| 32x2x1 | short-side 320 | 8 | ResNet50 (Heavy) | ImageNet | 76.08 | 92.34 | 10 clips x 3 crop | x | 17350 | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb_20220812-ed501b31.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.log) | - -1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. - According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, - e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. 
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :---------------------------: | :-------------------------: | :------------------------: | +| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalDotProduct) | ImageNet | 74.80 | 92.07 | 10 clips x 3 crop | 1779G | 35.4M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 74.73 | 91.80 | 10 clips x 3 crop | 1779G | 35.4M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-afd8f562.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalGauss) | ImageNet | 73.97 | 91.33 | 10 clips x 3 crop | 1695G | 31.7M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-0c5cbf5a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 | ImageNet | 73.47 | 91.27 | 10 clips x 3 crop | 1304G | 28.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| dense-32x2x1 | 224x224 | 8 | ResNet50 | ImageNet | 73.77 | 91.35 | 10 clips x 3 crop | 1304G | 28.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb_20220812-9f46003f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (Heavy) | ImageNet | 76.21 | 92.48 | 10 clips x 3 crop | 4988G | 33.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb_20220812-ed501b31.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. -For more details on data preparation, you can refer to - -- [preparing_kinetics](/tools/data/kinetics/README.md) +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). ## Train @@ -52,7 +48,7 @@ Example: train I3D model on Kinetics-400 dataset in a deterministic option with ```shell python tools/train.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True + --seed=0 --deterministic ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
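As a reading aid for the table above: a frame sampling strategy such as `32x2x1` denotes `clip_len` x `frame_interval` x `num_clips`, and it maps directly onto the `SampleFrames` transform used by the configs in this patch. A minimal sketch (the dicts mirror the pipelines added below):

```python
# "32x2x1" from the table: 32-frame clips, sampling every 2nd frame,
# 1 clip per video at training time.
train_pipeline = [
    dict(type='DecordInit'),  # open the video via the decord backend
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
    dict(type='DecordDecode'),  # decode only the sampled frame indices
]
```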
diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py index 95110c3327..9c835e34cd 100644 --- a/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -20,8 +20,14 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -37,7 +43,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=32, @@ -51,7 +57,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=32, @@ -102,3 +108,9 @@ test_evaluator = val_evaluator default_hooks = dict(checkpoint=dict(interval=5, max_keep_ckpts=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py index d36054d2a5..28f23521f1 100644 --- a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -10,8 +10,14 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -27,7 +33,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=32, @@ -41,7 +47,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=32, @@ -93,3 +99,9 @@ test_evaluator = val_evaluator default_hooks = dict(checkpoint=dict(interval=5, max_keep_ckpts=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. 
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py index 5215659677..9bfb7a4063 100644 --- a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py @@ -7,8 +7,13 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='DenseSampleFrames', clip_len=32, frame_interval=2, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -24,7 +29,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='DenseSampleFrames', clip_len=32, @@ -38,7 +43,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='DenseSampleFrames', clip_len=32, diff --git a/configs/recognition/i3d/metafile.yml b/configs/recognition/i3d/metafile.yml index dc94092b51..3ab946fe24 100644 --- a/configs/recognition/i3d/metafile.yml +++ b/configs/recognition/i3d/metafile.yml @@ -1,9 +1,9 @@ Collections: -- Name: I3D - README: configs/recognition/i3d/README.md - Paper: - URL: https://arxiv.org/abs/1705.07750 - Title: 'Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset' + - Name: I3D + README: configs/recognition/i3d/README.md + Paper: + URL: https://arxiv.org/abs/1705.07750 + Title: 'Quo Vadis, Action Recognition? 
A New Model and the Kinetics Dataset' Models: - Name: i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb @@ -13,10 +13,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 54334488576 - Parameters: 35397840 + FLOPs: 1779G + Parameters: 35.4M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -24,8 +24,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 74.76 - Top 5 Accuracy: 91.84 + Top 1 Accuracy: 74.80 + Top 5 Accuracy: 92.07 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth @@ -36,10 +36,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 54334488576 - Parameters: 35397840 + FLOPs: 1779G + Parameters: 35.4M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -47,10 +47,10 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 74.69 - Top 5 Accuracy: 91.69 + Top 1 Accuracy: 74.73 + Top 5 Accuracy: 91.80 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/ii3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-afd8f562.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-afd8f562.pth - Name: i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb Config: configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -59,10 +59,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 48962109440 - Parameters: 31723728 + FLOPs: 1695G + Parameters: 31.7M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -70,8 +70,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 73.90 - Top 5 Accuracy: 91.15 + Top 1 Accuracy: 73.97 + Top 5 Accuracy: 91.33 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-0c5cbf5a.pth @@ -82,10 +82,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 43564040192 - 
Parameters: 28043472 + FLOPs: 1304G + Parameters: 28.0M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -93,8 +93,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 73.22 - Top 5 Accuracy: 91.11 + Top 1 Accuracy: 73.47 + Top 5 Accuracy: 91.27 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth @@ -105,10 +105,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 43564040192 - Parameters: 28043472 + FLOPs: 1304G + Parameters: 28.0M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -128,8 +128,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 + FLOPs: 4988G + Parameters: 33.0M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -137,7 +139,7 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 76.08 - Top 5 Accuracy: 92.34 + Top 1 Accuracy: 76.21 + Top 5 Accuracy: 92.48 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb_20220812-ed501b31.pth From 2d1cdd857ed31fc68d89e52fa4a71908c7e8a9e0 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Thu, 8 Dec 2022 14:45:15 +0800 Subject: [PATCH 04/19] update slowfast --- ..._8xb8-dense-32x2x1-100e_kinetics400-rgb.py | 1 + configs/recognition/slowfast/README.md | 24 ++++----- configs/recognition/slowfast/metafile.yml | 50 +++++++++---------- ...st_r50_8xb8-4x16x1-256e_kinetics400-rgb.py | 12 +++-- 4 files changed, 46 insertions(+), 41 deletions(-) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py index 9bfb7a4063..57e592ca8d 100644 --- a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py @@ -7,6 +7,7 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + # file_client_args = dict( # io_backend='petrel', # path_mapping=dict( diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md index 389d8e98f0..4d03f0e38a 100644 --- a/configs/recognition/slowfast/README.md +++ b/configs/recognition/slowfast/README.md @@ -20,17 +20,15 @@ We present SlowFast networks for video recognition. 
Our model involves (i) a Slo ### Kinetics-400 -| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------------: | :------------: | :--: | :------------------: | :------: | :------: | :------: | :---------------: | :---------------------: | :--------: | :----------------: | :--------------: | :-------------: | -| 4x16x1 | Linear+Cosine | short-side 320 | 8 | ResNet50 | None | 75.27 | 92.27 | 10 clips x 3 crop | x | 6332 | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb_20220901-701b0f6f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.log) | -| 8x8x1 | Linear+Cosine | short-side 320 | 8 | ResNet50 | None | 76.31 | 92.88 | 10 clips x 3 crop | x | 9201 | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.log) | -| 8x8x1 | Linear+MultiStep | short-side 320 | 8 | ResNet50 | None | 76.33 | 92.66 | 10 clips x 3 crop | x | 9395 | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb_20220818-b62a501f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.log) | -| 8x8x1 | Linear+Cosine | short-side 320 | 8 | ResNet101 | None | 78.30 | 93.77 | 10 clips x 3 crop | x | 13431 | [config](/configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb_20220818-9c0e09bd.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.log) | -| 4x16x1 | Linear+Cosine | short-side 320 | 32 | ResNet101 + ResNet50 | None | 76.68 | 92.82 | 10 clips x 3 crop | x | 8039 | [config](/configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb_20220901-a77ac3ee.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.log) | - -1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. 
- According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, - e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. +| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------------: | :--------: | :--: | :------------------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :------------------------: | :-----------------------: | :----------------------: | +| 4x16x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 75.55 | 92.35 | 10 clips x 3 crop | 1090G | 34.5M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb_20220901-701b0f6f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 76.80 | 92.99 | 10 clips x 3 crop | 1982G | 34.6M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+MultiStep | 224x224 | 8 | ResNet50 | None | 76.65 | 92.86 | 10 clips x 3 crop | 1982G | 34.6M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb_20220818-b62a501f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet101 | None | 78.65 | 93.88 | 10 clips x 3 crop | 3805G | 62.9M | [config](/configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb_20220818-9c0e09bd.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.log) | +| 4x16x1 | Linear+Cosine | 224x224 | 32 | ResNet101 + ResNet50 | None | 77.03 | 92.99 | 10 clips x 3 crop | 1947G | 62.4M | [config](/configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb_20220901-a77ac3ee.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. For more details on data preparation, you can refer to the **Prepare videos** part in the [Data Preparation Tutorial](/docs/en/user_guides/2_data_prepare.md). @@ -47,10 +45,10 @@ Example: train SlowFast model on Kinetics-400 dataset in a deterministic option ```shell python tools/train.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True + --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). ## Test diff --git a/configs/recognition/slowfast/metafile.yml b/configs/recognition/slowfast/metafile.yml index 8d81d510f3..ad5550432d 100644 --- a/configs/recognition/slowfast/metafile.yml +++ b/configs/recognition/slowfast/metafile.yml @@ -13,10 +13,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 256 - FLOPs: 36441296896 - Parameters: 34479288 + FLOPs: 1090G + Parameters: 34.5M Pretrained: None - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -24,8 +24,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 75.27 - Top 5 Accuracy: 92.27 + Top 1 Accuracy: 75.55 + Top 5 Accuracy: 92.35 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb_20220901-701b0f6f.pth @@ -36,10 +36,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 256 - FLOPs: 66222034944 - Parameters: 34565560 + FLOPs: 1982G + Parameters: 34.6M Pretrained: None - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -47,8 +47,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 76.31 - Top 5 Accuracy: 92.88 + Top 1 Accuracy: 76.80 + Top 5 Accuracy: 92.99 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.log Weights: 
https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth @@ -59,10 +59,10 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 256 - FLOPs: 66222034944 - Parameters: 34565560 + FLOPs: 1982G + Parameters: 34.6M Pretrained: None - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -70,8 +70,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 76.33 - Top 5 Accuracy: 92.66 + Top 1 Accuracy: 76.65 + Top 5 Accuracy: 92.86 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb_20220818-b62a501f.pth @@ -82,10 +82,10 @@ Models: Architecture: ResNet101 Batch Size: 8 Epochs: 256 - FLOPs: 127070375936 - Parameters: 62912312 + FLOPs: 3805G + Parameters: 62.9M Pretrained: None - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -93,8 +93,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 78.30 - Top 5 Accuracy: 93.77 + Top 1 Accuracy: 78.65 + Top 5 Accuracy: 93.88 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb_20220901-9c0e09bd.pth @@ -105,18 +105,18 @@ Models: Architecture: ResNet101 + ResNet50 Batch Size: 8 Epochs: 256 - FLOPs: 65042780160 - Parameters: 62384312 + FLOPs: 1947G + Parameters: 62.4M Pretrained: None - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 32 GPUs Modality: RGB Results: - Dataset: Kinetics-400 Metrics: - Top 1 Accuracy: 76.68 - Top 5 Accuracy: 92.82 + Top 1 Accuracy: 77.03 + Top 5 Accuracy: 92.99 Task: Action Recognition Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb_20220901-a77ac3ee.pth diff --git a/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py b/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py index ed4fbdac3c..e6698ddb9c 100644 --- a/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py +++ b/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py @@ -8,8 +8,14 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ - 
dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -20,7 +26,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=32, @@ -34,7 +40,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=32, From be4ded20e4bb6ca3b53918f14ef4c8c899779132 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Thu, 8 Dec 2022 15:02:20 +0800 Subject: [PATCH 05/19] fix slowfast --- configs/recognition/slowfast/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md index 4d03f0e38a..9fa48615b1 100644 --- a/configs/recognition/slowfast/README.md +++ b/configs/recognition/slowfast/README.md @@ -31,7 +31,7 @@ We present SlowFast networks for video recognition. Our model involves (i) a Slo 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. -For more details on data preparation, you can refer to the **Prepare videos** part in the [Data Preparation Tutorial](/docs/en/user_guides/2_data_prepare.md). +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). 
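As a quick illustration of the annotation format mentioned in note 2 above, here is a minimal parsing sketch (the whitespace delimiter is our assumption; the README only gives the field order):

```python
# Minimal sketch: parse the Kinetics-400 validation list linked in note 2.
# Each line is assumed to hold 'video_id num_frames label_index'; the
# whitespace delimiter is an assumption, adjust it if your copy differs.
def load_video_list(path):
    samples = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            video_id, num_frames, label_index = line.split()
            samples.append((video_id, int(num_frames), int(label_index)))
    return samples

val_samples = load_video_list('kinetics_val_list.txt')
print(len(val_samples))  # should be 19796 for the validation set described above
```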
## Train From c57a27009d84b1c6626e59c3bc08467c5e0bc58f Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Thu, 8 Dec 2022 19:28:57 +0800 Subject: [PATCH 06/19] fix flops --- configs/recognition/c3d/README.md | 2 +- configs/recognition/c3d/metafile.yml | 2 +- configs/recognition/i3d/README.md | 16 ++++---- configs/recognition/i3d/metafile.yml | 12 +++--- configs/recognition/slowfast/README.md | 10 ++--- configs/recognition/slowfast/metafile.yml | 10 ++--- configs/recognition/swin/README.md | 22 +++++----- configs/recognition/swin/metafile.yml | 40 +++++++++---------- ...pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 10 ++--- ...re_16xb8-amp-32x2x1-30e_kinetics700-rgb.py | 10 ++--- ...pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 10 ++--- ...pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 10 ++--- ...pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 10 ++--- 13 files changed, 82 insertions(+), 82 deletions(-) diff --git a/configs/recognition/c3d/README.md b/configs/recognition/c3d/README.md index 27a84af87c..958119f048 100644 --- a/configs/recognition/c3d/README.md +++ b/configs/recognition/c3d/README.md @@ -22,7 +22,7 @@ We propose a simple, yet effective approach for spatiotemporal feature learning | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :----------------------------------: | :--------------------------------: | :-------------------------------: | -| 16x1x1 | 112x112 | 8 | c3d | sports1m | 83.08 | 95.93 | 10 clips x 1 crop | 385G | 78.4M | [config](/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.log) | +| 16x1x1 | 112x112 | 8 | c3d | sports1m | 83.08 | 95.93 | 10 clips x 1 crop | 38.5G | 78.4M | [config](/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.log) | 1. The author of C3D normalized UCF-101 with volume mean and used SVM to classify videos, while we normalized the dataset with RGB mean value and used a linear classifier. 2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 
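The scaling that `--auto-scale-lr` applies is the standard linear rule; a small sketch with illustrative numbers (the base learning rate below is hypothetical, not read from the config):

```python
# Linear scaling rule behind `--auto-scale-lr` (illustrative values only):
# the configured learning rate is rescaled by actual batch / original batch.
base_lr = 0.1             # hypothetical lr tuned for the original setup
original_batch = 8 * 30   # e.g. 8 gpus x 30 videos per gpu, as in the 8xb30 config name
actual_batch = 4 * 30     # the same config run on 4 gpus instead

scaled_lr = base_lr * actual_batch / original_batch
print(scaled_lr)  # 0.05 -> half the gpus, half the learning rate
```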
diff --git a/configs/recognition/c3d/metafile.yml b/configs/recognition/c3d/metafile.yml index c25bed15bc..9944352256 100644 --- a/configs/recognition/c3d/metafile.yml +++ b/configs/recognition/c3d/metafile.yml @@ -13,7 +13,7 @@ Models: Architecture: c3d Batch Size: 30 Epochs: 45 - FLOPs: 385G + FLOPs: 38.5G Parameters: 78.4M Pretrained: sports1m Resolution: 112x112 diff --git a/configs/recognition/i3d/README.md b/configs/recognition/i3d/README.md index 094084f196..e181eaf195 100644 --- a/configs/recognition/i3d/README.md +++ b/configs/recognition/i3d/README.md @@ -22,14 +22,14 @@ The paucity of videos in current action classification datasets (UCF-101 and HMD ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | -| :---------------------: | :--------: | :--: | :---------------------------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :---------------------------: | :-------------------------: | :------------------------: | -| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalDotProduct) | ImageNet | 74.80 | 92.07 | 10 clips x 3 crop | 1779G | 35.4M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.log) | -| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 74.73 | 91.80 | 10 clips x 3 crop | 1779G | 35.4M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-afd8f562.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | -| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalGauss) | ImageNet | 73.97 | 91.33 | 10 clips x 3 crop | 1695G | 31.7M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-0c5cbf5a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | -| 32x2x1 | 224x224 | 8 | ResNet50 | ImageNet | 73.47 | 91.27 | 10 clips x 3 crop | 1304G | 28.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.log) | -| dense-32x2x1 | 224x224 | 8 | ResNet50 | ImageNet | 73.77 | 91.35 | 10 clips x 3 crop | 1304G | 28.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb_20220812-9f46003f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.log) | -| 32x2x1 | 224x224 | 8 | ResNet50 (Heavy) | ImageNet | 76.21 | 92.48 | 10 clips x 3 crop | 4988G | 33.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb_20220812-ed501b31.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :--------------------------: | :-------------------------: | :------------------------: | +| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalDotProduct) | ImageNet | 74.80 | 92.07 | 10 clips x 3 crop | 59.3G | 35.4M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (NonLocalEmbedGauss) | ImageNet | 74.73 | 91.80 | 10 clips x 3 crop | 59.3G | 35.4M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-afd8f562.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-embedded-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 
32x2x1 | 224x224 | 8 | ResNet50 (NonLocalGauss) | ImageNet | 73.97 | 91.33 | 10 clips x 3 crop | 56.5G | 31.7M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb_20220812-0c5cbf5a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-gaussian_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 | ImageNet | 73.47 | 91.27 | 10 clips x 3 crop | 43.5G | 28.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.log) | +| dense-32x2x1 | 224x224 | 8 | ResNet50 | ImageNet | 73.77 | 91.35 | 10 clips x 3 crop | 43.5G | 28.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb_20220812-9f46003f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet50 (Heavy) | ImageNet | 76.21 | 92.48 | 10 clips x 3 crop | 166.3G | 33.0M | [config](/configs/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb_20220812-ed501b31.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-heavy_8xb8-32x2x1-100e_kinetics400-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
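The corrected FLOPs above appear to count a single view rather than the full testing protocol; a quick arithmetic check of that reading (our interpretation, the patch does not state it):

```python
# Consistency check for the 'fix flops' change (our reading, not stated in the
# patch): old entries look like whole-protocol FLOPs, new ones like per-view FLOPs.
clips, crops = 10, 3            # '10 clips x 3 crop' testing protocol
views = clips * crops           # 30 views per video
per_view_gflops = 59.3          # new value for ResNet50 (NonLocalDotProduct)
print(round(per_view_gflops * views))  # 1779 -> matches the old 1779G entry
```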
diff --git a/configs/recognition/i3d/metafile.yml b/configs/recognition/i3d/metafile.yml index 3ab946fe24..63ad017343 100644 --- a/configs/recognition/i3d/metafile.yml +++ b/configs/recognition/i3d/metafile.yml @@ -13,7 +13,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 1779G + FLOPs: 59.3G Parameters: 35.4M Pretrained: ImageNet Resolution: 224x224 @@ -36,7 +36,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 1779G + FLOPs: 59.3G Parameters: 35.4M Pretrained: ImageNet Resolution: 224x224 @@ -59,7 +59,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 1695G + FLOPs: 56.5G Parameters: 31.7M Pretrained: ImageNet Resolution: 224x224 @@ -82,7 +82,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 1304G + FLOPs: 43.5G Parameters: 28.0M Pretrained: ImageNet Resolution: 224x224 @@ -105,7 +105,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 1304G + FLOPs: 43.5G Parameters: 28.0M Pretrained: ImageNet Resolution: 224x224 @@ -128,7 +128,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 4988G + FLOPs: 166.3G Parameters: 33.0M Pretrained: ImageNet Resolution: 224x224 diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md index 9fa48615b1..762278e416 100644 --- a/configs/recognition/slowfast/README.md +++ b/configs/recognition/slowfast/README.md @@ -22,11 +22,11 @@ We present SlowFast networks for video recognition. Our model involves (i) a Slo | frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :--------------: | :--------: | :--: | :------------------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :------------------------: | :-----------------------: | :----------------------: | -| 4x16x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 75.55 | 92.35 | 10 clips x 3 crop | 1090G | 34.5M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb_20220901-701b0f6f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.log) | -| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 76.80 | 92.99 | 10 clips x 3 crop | 1982G | 34.6M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.log) | -| 8x8x1 | Linear+MultiStep | 224x224 | 8 | ResNet50 | None | 76.65 | 92.86 | 10 clips x 3 crop | 1982G | 34.6M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb_20220818-b62a501f.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.log) | -| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet101 | None | 78.65 | 93.88 | 10 clips x 3 crop | 3805G | 62.9M | [config](/configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb_20220818-9c0e09bd.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.log) | -| 4x16x1 | Linear+Cosine | 224x224 | 32 | ResNet101 + ResNet50 | None | 77.03 | 92.99 | 10 clips x 3 crop | 1947G | 62.4M | [config](/configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb_20220901-a77ac3ee.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.log) | +| 4x16x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 75.55 | 92.35 | 10 clips x 3 crop | 36.3G | 34.5M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb_20220901-701b0f6f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet50 | None | 76.80 | 92.99 | 10 clips x 3 crop | 66.1G | 34.6M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+MultiStep | 224x224 | 8 | ResNet50 | None | 76.65 | 92.86 | 10 clips x 3 crop | 66.1G | 34.6M | [config](/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb_20220818-b62a501f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-steplr-256e_kinetics400-rgb.log) | +| 8x8x1 | Linear+Cosine | 224x224 | 8 | ResNet101 | None | 78.65 | 93.88 | 10 clips x 3 crop | 126G | 62.9M | [config](/configs/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb_20220818-9c0e09bd.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r101_8xb8-8x8x1-256e_kinetics400-rgb.log) | +| 4x16x1 | Linear+Cosine | 224x224 | 32 | ResNet101 + ResNet50 | None | 77.03 | 92.99 | 10 clips x 3 crop | 64.9G | 62.4M | [config](/configs/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb_20220901-a77ac3ee.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb/slowfast_r101-r50_32xb8-4x16x1-256e_kinetics400-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. diff --git a/configs/recognition/slowfast/metafile.yml b/configs/recognition/slowfast/metafile.yml index ad5550432d..94423659d1 100644 --- a/configs/recognition/slowfast/metafile.yml +++ b/configs/recognition/slowfast/metafile.yml @@ -13,7 +13,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 256 - FLOPs: 1090G + FLOPs: 36.3G Parameters: 34.5M Pretrained: None Resolution: 224x224 @@ -36,7 +36,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 256 - FLOPs: 1982G + FLOPs: 66.1G Parameters: 34.6M Pretrained: None Resolution: 224x224 @@ -59,7 +59,7 @@ Models: Architecture: ResNet50 Batch Size: 8 Epochs: 256 - FLOPs: 1982G + FLOPs: 66.1G Parameters: 34.6M Pretrained: None Resolution: 224x224 @@ -82,7 +82,7 @@ Models: Architecture: ResNet101 Batch Size: 8 Epochs: 256 - FLOPs: 3805G + FLOPs: 126G Parameters: 62.9M Pretrained: None Resolution: 224x224 @@ -105,7 +105,7 @@ Models: Architecture: ResNet101 + ResNet50 Batch Size: 8 Epochs: 256 - FLOPs: 1947G + FLOPs: 64.9G Parameters: 62.4M Pretrained: None Resolution: 224x224 diff --git a/configs/recognition/swin/README.md b/configs/recognition/swin/README.md index c36a47b39e..c5ad4d51bb 100644 --- a/configs/recognition/swin/README.md +++ b/configs/recognition/swin/README.md @@ -20,25 +20,25 @@ The vision community is witnessing a modeling shift from CNNs to Transformers, w ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | gpu_mem(M) | FLOPs | params | config | ckpt | log | -| :---------------------: | :------------: | :--: | :------: | :----------: | :------: | :------: | :-----------------------: | :-----------------------: | :---------------: | :--------: | :---: | :----: | :-----------: | :---------: | :---------: | -| 32x2x1 | short-side 320 | 8 | Swin-T | ImageNet-1k | 78.29 | 93.58 | 
78.46 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 93.46 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crops | 21072 | 88G | 28.2M | [config](/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-241016b2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | -| 32x2x1 | short-side 320 | 8 | Swin-S | ImageNet-1k | 80.23 | 94.32 | 80.23 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.16 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crops | 33632 | 166G | 49.8M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-e91ab986.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | -| 32x2x1 | short-side 320 | 8 | Swin-B | ImageNet-1k | 80.21 | 94.32 | 80.27 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.42 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crops | 45143 | 282G | 88.0M | [config](/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | -| 32x2x1 | short-side 320 | 8 | Swin-L | ImageNet-22k | 83.15 | 95.76 | 83.1\* | 95.9\* | 4 clips x 3 crops | 68881 | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :----------: | :------: | :------: | :--------------------------: | :--------------------------: | :--------------: | :---: | :----: | :--------------: | :------------: | :------------: | +| 32x2x1 | 224x224 | 8 | Swin-T | ImageNet-1k | 78.90 
| 93.77 | 78.46 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 93.46 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 88G | 28.2M | [config](/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-241016b2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-S | ImageNet-1k | 80.54 | 94.46 | 80.23 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.16 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 166G | 49.8M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-e91ab986.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-B | ImageNet-1k | 80.57 | 94.49 | 80.27 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.42 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 282G | 88.0M | [config](/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-L | ImageNet-22k | 83.46 | 95.91 | 83.1\* | 95.9\* | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | ### Kinetics-700 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | gpu_mem(M) | FLOPs | params | config | ckpt | log | -| :---------------------: | :------------: | :--: | :------: | :----------: | :------: | :------: | :---------------: | :--------: | :---: | :----: | :----------------------------: | :--------------------------: | :-------------------------: | -| 32x2x1 | short-side 320 | 16 | Swin-L | ImageNet-22k | 75.26 | 92.44 | 4 clips x 3 crops | 68898 | 604G | 197M | 
[config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :----------: | :------: | :------: | :--------------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: | +| 32x2x1 | 224x224 | 16 | Swin-L | ImageNet-22k | 75.92 | 92.72 | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The values in columns named after "reference" are the results got by testing on our dataset, using the checkpoints provided by the author with same model settings. `*` means that the numbers are copied from the paper. 3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. 4. Pre-trained image models can be downloaded from [Swin Transformer for ImageNet Classification](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models). -For more details on data preparation, you can refer to [preparing_kinetics](/tools/data/kinetics/README.md). +For more details on data preparation, you can refer to [Kinetics](/tools/data/kinetics/README.md). 
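If you download one of those pre-trained 2D checkpoints manually, it can be wired into the config through the backbone's pretrained field; a hypothetical sketch (the field name and local path are assumptions, check the actual config file):

```python
# Hypothetical override sketch: point the Video Swin backbone at a locally
# downloaded 2D Swin checkpoint (the field name and path are assumptions).
model = dict(
    backbone=dict(
        pretrained='checkpoints/swin_tiny_patch4_window7_224.pth'))
```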
## Train @@ -52,7 +52,7 @@ Example: train VideoSwin model on Kinetics-400 dataset in a deterministic option ```shell python tools/train.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True + --seed=0 --deterministic ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). diff --git a/configs/recognition/swin/metafile.yml b/configs/recognition/swin/metafile.yml index b557d6e2ed..0a4cc41cb7 100644 --- a/configs/recognition/swin/metafile.yml +++ b/configs/recognition/swin/metafile.yml @@ -1,9 +1,9 @@ Collections: -- Name: Swin - README: configs/recognition/swin/README.md - Paper: - URL: https://arxiv.org/abs/2106.13230 - Title: 'Video Swin Transformer' + - Name: Swin + README: configs/recognition/swin/README.md + Paper: + URL: https://arxiv.org/abs/2106.13230 + Title: 'Video Swin Transformer' Models: - Name: swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb @@ -16,7 +16,7 @@ Models: FLOPs: 88G Parameters: 28.2M Pretrained: ImageNet-1K - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -24,8 +24,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 78.29 - Top 5 Accuracy: 93.58 + Top 1 Accuracy: 78.90 + Top 5 Accuracy: 93.77 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-241016b2.pth @@ -39,7 +39,7 @@ Models: FLOPs: 166G Parameters: 49.8M Pretrained: ImageNet-1K - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -47,8 +47,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 80.23 - Top 5 Accuracy: 94.32 + Top 1 Accuracy: 80.54 + Top 5 Accuracy: 94.46 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-e91ab986.pth @@ -62,7 +62,7 @@ Models: FLOPs: 282G Parameters: 88.0M Pretrained: ImageNet-1K - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -70,8 +70,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 80.21 - Top 5 Accuracy: 94.32 + Top 1 Accuracy: 80.57 + Top 5 Accuracy: 94.49 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth @@ -85,7 +85,7 @@ Models: FLOPs: 604G 
Parameters: 197M Pretrained: ImageNet-22K - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -93,8 +93,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 83.15 - Top 5 Accuracy: 95.76 + Top 1 Accuracy: 83.46 + Top 5 Accuracy: 95.91 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth @@ -108,7 +108,7 @@ Models: FLOPs: 604G Parameters: 197M Pretrained: ImageNet-22K - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-700 Training Resources: 16 GPUs Modality: RGB @@ -116,7 +116,7 @@ Models: - Dataset: Kinetics-700 Task: Action Recognition Metrics: - Top 1 Accuracy: 75.26 - Top 5 Accuracy: 92.44 + Top 1 Accuracy: 75.92 + Top 5 Accuracy: 92.72 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb_20220930-f8d74db7.pth diff --git a/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index ebb304b6a0..8038e3307b 100644 --- a/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py +++ b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -19,11 +19,11 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) -file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict( + {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +# file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), diff --git a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py index 0fbbb465ec..34ffe7a31a 100644 --- a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py +++ b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py @@ -11,11 +11,11 @@ ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics700': 's3://openmmlab/datasets/action/Kinetics700'})) -file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + 
path_mapping=dict( + {'data/kinetics700': 's3://openmmlab/datasets/action/Kinetics700'})) +# file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), diff --git a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index 2696d18c9c..24fe3ec95e 100644 --- a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py +++ b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -19,11 +19,11 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) -file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict( + {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +# file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), diff --git a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index 1d6312c224..0c5d784f13 100644 --- a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py +++ b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -18,11 +18,11 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) -file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict( + {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +# file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), diff --git a/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index f44e1d5e72..eb828311c1 100644 --- a/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py +++ b/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -16,11 +16,11 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) -file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict( + {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +# file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, 
frame_interval=2, num_clips=1), From 5597defc1a89eb0157922617bf7d9ad5ac70e7a7 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Thu, 8 Dec 2022 21:53:12 +0800 Subject: [PATCH 07/19] update swin --- configs/recognition/swin/README.md | 6 +++--- ...877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 10 +++++----- ...7_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py | 10 +++++----- ...77_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 10 +++++----- ...877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 10 +++++----- ...877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 10 +++++----- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/configs/recognition/swin/README.md b/configs/recognition/swin/README.md index c5ad4d51bb..1e6074c4a9 100644 --- a/configs/recognition/swin/README.md +++ b/configs/recognition/swin/README.md @@ -22,9 +22,9 @@ The vision community is witnessing a modeling shift from CNNs to Transformers, w | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top1 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :--------: | :--: | :------: | :----------: | :------: | :------: | :--------------------------: | :--------------------------: | :--------------: | :---: | :----: | :--------------: | :------------: | :------------: | -| 32x2x1 | 224x224 | 8 | Swin-T | ImageNet-1k | 78.90 | 93.77 | 78.46 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 93.46 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 88G | 28.2M | [config](/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-241016b2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | -| 32x2x1 | 224x224 | 8 | Swin-S | ImageNet-1k | 80.54 | 94.46 | 80.23 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.16 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 166G | 49.8M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-e91ab986.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | -| 32x2x1 | 224x224 | 8 | Swin-B | ImageNet-1k | 80.57 | 94.49 | 80.27 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.42 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 282G | 88.0M | [config](/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-T | ImageNet-1k | 78.90 | 93.77 | 78.84 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 93.76 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 88G | 28.2M | [config](/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-241016b2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-S | ImageNet-1k | 80.54 | 94.46 | 80.58 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.45 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 166G | 49.8M | [config](/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-e91ab986.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | Swin-B | ImageNet-1k | 80.57 | 94.49 | 80.55 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 94.66 \[[VideoSwin](https://github.com/SwinTransformer/Video-Swin-Transformer)\] | 4 clips x 3 crop | 282G | 88.0M | [config](/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | | 32x2x1 | 224x224 | 8 | Swin-L | ImageNet-22k | 83.46 | 95.91 | 83.1\* | 95.9\* | 4 clips x 3 crop | 604G | 197M | [config](/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.log) | ### Kinetics-700 diff --git 
a/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index 8038e3307b..ebb304b6a0 100644 --- a/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py +++ b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -19,11 +19,11 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -file_client_args = dict( - io_backend='petrel', - path_mapping=dict( - {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) -# file_client_args = dict(io_backend='disk') +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), diff --git a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py index 34ffe7a31a..0fbbb465ec 100644 --- a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py +++ b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py @@ -11,11 +11,11 @@ ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' -file_client_args = dict( - io_backend='petrel', - path_mapping=dict( - {'data/kinetics700': 's3://openmmlab/datasets/action/Kinetics700'})) -# file_client_args = dict(io_backend='disk') +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics700': 's3://openmmlab/datasets/action/Kinetics700'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), diff --git a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index 24fe3ec95e..2696d18c9c 100644 --- a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py +++ b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -19,11 +19,11 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -file_client_args = dict( - io_backend='petrel', - path_mapping=dict( - {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) -# file_client_args = dict(io_backend='disk') +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), diff --git a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index 0c5d784f13..1d6312c224 100644 --- 
a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py +++ b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -18,11 +18,11 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -file_client_args = dict( - io_backend='petrel', - path_mapping=dict( - {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) -# file_client_args = dict(io_backend='disk') +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), diff --git a/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index eb828311c1..f44e1d5e72 100644 --- a/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py +++ b/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py @@ -16,11 +16,11 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -file_client_args = dict( - io_backend='petrel', - path_mapping=dict( - {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) -# file_client_args = dict(io_backend='disk') +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), From 7e817055b1f27db91d040aac2566233fb0bbeb21 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Fri, 9 Dec 2022 15:33:33 +0800 Subject: [PATCH 08/19] update r2plus1d --- configs/recognition/r2plus1d/README.md | 18 +++++++-------- configs/recognition/r2plus1d/metafile.yml | 22 +++++++++---------- ...1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py | 12 +++++++--- ...s1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py | 18 ++++++++++++--- 4 files changed, 43 insertions(+), 27 deletions(-) diff --git a/configs/recognition/r2plus1d/README.md b/configs/recognition/r2plus1d/README.md index beb4d0ffa7..29a619e696 100644 --- a/configs/recognition/r2plus1d/README.md +++ b/configs/recognition/r2plus1d/README.md @@ -20,17 +20,15 @@ In this paper we discuss several forms of spatiotemporal convolutions for video ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :---------------------: | :--------: | :-------------------------: | :-----------------------: | :-----------------------: | -| 8x8x1 | short-side 320 | 8 | ResNet34 | None | 69.35 | 88.32 | 10 clips x 3 crop | x | 5036 | [config](/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb_20220812-47cfe041.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.log) | -| 32x2x1 | short-side 320 | 8 | ResNet34 | None | 75.27 | 92.03 | 10 clips x 3 crop | x | 17006 | [config](/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v2.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb_20220812-4270588c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.log) | - -1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. - According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, - e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------: | :---: | :----: | :----------------------------------: | :--------------------------------: | :-------------------------------: | +| 8x8x1 | 224x224 | 8 | ResNet34 | None | 69.76 | 88.41 | 10 clips x 3 crop | 53.1G | 63.8M | [config](/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb_20220812-47cfe041.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.log) | +| 32x2x1 | 224x224 | 8 | ResNet34 | None | 75.46 | 92.28 | 10 clips x 3 crop | 213G | 63.8M | [config](/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb_20220812-4270588c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). 
The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. -For more details on data preparation, you can refer to the **Prepare videos** part in the [Data Preparation Tutorial](/docs/en/user_guides/2_data_prepare.md). +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). ## Train @@ -44,7 +42,7 @@ Example: train R(2+1)D model on Kinetics-400 dataset in a deterministic option. ```shell python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True + --seed=0 --deterministic ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). diff --git a/configs/recognition/r2plus1d/metafile.yml b/configs/recognition/r2plus1d/metafile.yml index e89c475037..376687b031 100644 --- a/configs/recognition/r2plus1d/metafile.yml +++ b/configs/recognition/r2plus1d/metafile.yml @@ -13,10 +13,10 @@ Models: Architecture: ResNet34 Batch Size: 8 Epochs: 180 - FLOPs: 53175572992 - Parameters: 63759281 + FLOPs: 53.1G + Parameters: 63.8M Pretrained: None - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -24,8 +24,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 69.35 - Top 5 Accuracy: 88.32 + Top 1 Accuracy: 69.76 + Top 5 Accuracy: 88.41 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb_20220812-47cfe041.pth @@ -36,10 +36,10 @@ Models: Architecture: ResNet34 Batch Size: 8 Epochs: 180 - FLOPs: 212701677568 - Parameters: 63759281 + FLOPs: 213G + Parameters: 63.8M Pretrained: None - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -47,7 +47,7 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 75.27 - Top 5 Accuracy: 92.03 + Top 1 Accuracy: 75.46 + Top 5 Accuracy: 92.28 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.log - Weights: https://download.openmmlab.com/mmaction/v2.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb_20220812-4270588c.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb_20220812-4270588c.pth diff --git a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py index 4815cb7451..61ebe26cd9 100644 --- a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py +++ b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py @@ -7,8 +7,14 @@ ann_file_train = 
'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -19,7 +25,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=32, @@ -33,7 +39,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=32, diff --git a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py index d7e3d93d94..8aa68449d2 100644 --- a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py +++ b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py @@ -9,8 +9,14 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -21,7 +27,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=8, @@ -35,7 +41,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=8, @@ -103,3 +109,9 @@ ] default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=64) From 63bf4a6cfc73b5c7b5d16193a8508bb097435673 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Fri, 9 Dec 2022 21:53:11 +0800 Subject: [PATCH 09/19] update tanet --- configs/recognition/tanet/README.md | 24 +++++----- configs/recognition/tanet/metafile.yml | 44 ++++++++++--------- ...retrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py | 19 ++++++-- ...pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py | 17 +++++-- ...0_8xb8-dense-1x1x8-100e_kinetics400-rgb.py | 17 +++++-- 5 files changed, 77 insertions(+), 44 deletions(-) diff --git a/configs/recognition/tanet/README.md b/configs/recognition/tanet/README.md index 77635a3120..4b5170e5fc 100644 --- a/configs/recognition/tanet/README.md +++ b/configs/recognition/tanet/README.md @@ -20,27 +20,25 @@ Video data is with complex temporal dynamics due to various factors such as came ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :---------------------: | :---------------: | :---------------------: | :--------: | :----------: | :--------: | :-------: | -| dense-1x1x8 | short-side 320 | 8 | ResNet50 | ImageNet | 76.25 | 92.41 | [76.22](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | [92.53](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | 80 clips x 3 crop | x | 7627 | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb_20220919-a34346bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :--------------------------: | :---------------------------: | :---------------: | :---: | :----: | :---------------: | :-------------: | :------------: | +| dense-1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 76.25 | 92.41 | [76.22](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | [92.53](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | 10 clips x 3 crop | 43.0G | 25.6M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb_20220919-a34346bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.log) 
| ### Something-Something V1 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :--------------: | :---------------------: | :--------: | :-------------: | :-----------: | :----------: | -| 1x1x8 | height 100 | 8 | ResNet50 | ImageNet | 46.98/49.71 | 75.75/77.43 | 8 clips x 3 crop | x | 7116 | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb_20220906-de50e4ef.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.log) | -| 1x1x16 | height 100 | 8 | ResNet50 | ImageNet | 48.24/50.95 | 78.16/79.28 | 8 clips x 3 crop | x | 10464 | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb_20220919-cc37e9b8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :---------: | :---------: | :---------------: | :---: | :----: | :--------------------------------: | :------------------------------: | :-----------------------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 46.13/49.00 | 75.06/77.22 | 16 clips x 3 crop | 43.1G | 25.1M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb_20220906-de50e4ef.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 48.52/51.02 | 78.61/79.83 | 16 clips x 3 crop | 86.1G | 25.1M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb_20220919-cc37e9b8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.log) | -1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. 
- According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, - e.g., lr=0.01 for 8 GPUs x 8 videos/gpu and lr=0.04 for 16 GPUs x 16 videos/gpu. +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The values in columns named after "reference" are the results got by testing on our dataset, using the checkpoints provided by the author with same model settings. The checkpoints for reference repo can be downloaded [here](https://drive.google.com/drive/folders/1sFfmP3yrfc7IzRshEELOby7-aEoymIFL?usp=sharing). 3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/v1.0/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/v1.0/dataset/k400_val/kinetics_class2ind.txt) are also available. For more details on data preparation, you can refer to -- [preparing_kinetics](/tools/data/kinetics/README.md) -- [preparing_sthv1](/tools/data/sthv1/README.md) +- [Kinetics](/tools/data/kinetics/README.md) +- [Something-something V1](/tools/data/sthv1/README.md) ## Train @@ -54,7 +52,7 @@ Example: train TANet model on Kinetics-400 dataset in a deterministic option wit ```shell python tools/train.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True + --seed=0 --deterministic ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
diff --git a/configs/recognition/tanet/metafile.yml b/configs/recognition/tanet/metafile.yml index 730540ca98..12d0ffd1e2 100644 --- a/configs/recognition/tanet/metafile.yml +++ b/configs/recognition/tanet/metafile.yml @@ -10,13 +10,13 @@ Models: Config: configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py In Collection: TANet Metadata: - Architecture: TANet + Architecture: ResNet50 Batch Size: 8 Epochs: 100 - FLOPs: 43065983104 - Parameters: 25590320 + FLOPs: 43.0G + Parameters: 25.6M Pretrained: ImageNet - Resolution: short-side 320 + Resolution: 224x224 Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -28,17 +28,18 @@ Models: Top 5 Accuracy: 92.41 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb_20220919-a34346bc.pth + - Name: tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb Config: configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py In Collection: TANet Metadata: - Architecture: TANet + Architecture: ResNet50 Batch Size: 8 Epochs: 50 - FLOPs: 32972787840 - Parameters: 25127246 + FLOPs: 43.1G + Parameters: 25.1M Pretrained: ImageNet - Resolution: height 100 + Resolution: 224x224 Training Data: SthV1 Training Resources: 8 GPUs Modality: RGB @@ -46,23 +47,24 @@ Models: - Dataset: SthV1 Task: Action Recognition Metrics: - Top 1 Accuracy: 49.71 - Top 1 Accuracy (efficient): 46.98 - Top 5 Accuracy: 77.43 - Top 5 Accuracy (efficient): 75.75 + Top 1 Accuracy: 49.00 + Top 1 Accuracy (efficient): 46.13 + Top 5 Accuracy: 77.22 + Top 5 Accuracy (efficient): 75.06 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb_20220906-de50e4ef.pth + - Name: tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb Config: configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py In Collection: TANet Metadata: - Architecture: TANet - Batch Size: 8 + Architecture: ResNet50 + Batch Size: 6 Epochs: 50 - FLOPs: 65946542336 - Parameters: 25134670 + FLOPs: 86.1G + Parameters: 25.1M Pretrained: ImageNet - Resolution: height 100 + Resolution: 224x224 Training Data: SthV1 Training Resources: 8 GPUs Modality: RGB @@ -70,9 +72,9 @@ Models: - Dataset: SthV1 Task: Action Recognition Metrics: - Top 1 Accuracy: 50.95 - Top 1 Accuracy (efficient): 48.24 - Top 5 Accuracy: 79.28 - Top 5 Accuracy (efficient): 78.16 + Top 1 Accuracy: 51.02 + Top 1 Accuracy (efficient): 48.52 + Top 5 Accuracy: 79.83 + Top 5 Accuracy (efficient): 78.61 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.log Weights: 
https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb_20220919-cc37e9b8.pth diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py index 0c9ed640c6..ec3a6b6817 100644 --- a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py +++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py @@ -17,9 +17,14 @@ ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/sthv1': 's3://openmmlab/datasets/action/sthv1'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict( type='MultiScaleCrop', @@ -40,7 +45,7 @@ frame_interval=1, num_clips=16, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), dict(type='FormatShape', input_format='NCHW'), @@ -54,13 +59,13 @@ num_clips=16, twice_sample=True, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='ThreeCrop', crop_size=256), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] - +test_pipeline = val_pipeline train_dataloader = dict( batch_size=6, num_workers=8, @@ -113,3 +118,9 @@ ] default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (6 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py index 173b0e88c5..5e309cdd15 100644 --- a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py +++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py @@ -15,9 +15,14 @@ ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/sthv1': 's3://openmmlab/datasets/action/sthv1'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict( type='MultiScaleCrop', @@ -38,7 +43,7 @@ frame_interval=1, num_clips=8, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), dict(type='FormatShape', input_format='NCHW'), @@ -52,7 +57,7 @@ num_clips=8, twice_sample=True, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='ThreeCrop', crop_size=256), dict(type='FormatShape', input_format='NCHW'), @@ -111,3 +116,9 @@ ] default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py index d99f3de90e..312387535f 100644 --- a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py +++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py @@ -11,8 +11,13 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -29,7 +34,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='DenseSampleFrames', clip_len=1, @@ -43,7 +48,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='DenseSampleFrames', clip_len=1, @@ -107,3 +112,9 @@ default_hooks = dict( checkpoint=dict(max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. 
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) From f1ab6a23a55003b86701baa8171adf40c5af31dc Mon Sep 17 00:00:00 2001 From: wxDai Date: Mon, 12 Dec 2022 10:44:54 +0800 Subject: [PATCH 10/19] Update configs/recognition/slowfast/README.md --- configs/recognition/slowfast/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md index 762278e416..3bf1666152 100644 --- a/configs/recognition/slowfast/README.md +++ b/configs/recognition/slowfast/README.md @@ -48,7 +48,7 @@ python tools/train.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e --seed=0 --deterministic ``` -For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). ## Test From 0fdd19936115ebef6740016da8cbfc698f703e28 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Mon, 12 Dec 2022 11:07:22 +0800 Subject: [PATCH 11/19] fix tanet --- configs/recognition/tanet/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/recognition/tanet/README.md b/configs/recognition/tanet/README.md index 4b5170e5fc..f0b77af549 100644 --- a/configs/recognition/tanet/README.md +++ b/configs/recognition/tanet/README.md @@ -20,9 +20,9 @@ Video data is with complex temporal dynamics due to various factors such as came ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | log | -| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :--------------------------: | :---------------------------: | :---------------: | :---: | :----: | :---------------: | :-------------: | :------------: | -| dense-1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 76.25 | 92.41 | [76.22](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | [92.53](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | 10 clips x 3 crop | 43.0G | 25.6M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb_20220919-a34346bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :---------------------------: | :---------------------------: | :--------------: | :---: | :----: | :---------------: | :-------------: | :------------: | +| dense-1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 76.25 | 92.41 | [76.22](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | 
[92.53](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | 8 clips x 3 crop | 43.0G | 25.6M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb_20220919-a34346bc.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.log) | ### Something-Something V1 @@ -37,7 +37,7 @@ Video data is with complex temporal dynamics due to various factors such as came For more details on data preparation, you can refer to -- [Kinetics](/tools/data/kinetics/README.md) +- [Kinetics400](/tools/data/kinetics/README.md) - [Something-something V1](/tools/data/sthv1/README.md) ## Train From 7fff7ea3247c13da75682bd4959110c70b711c91 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Mon, 19 Dec 2022 11:29:46 +0800 Subject: [PATCH 12/19] fix tanet --- configs/recognition/tanet/README.md | 4 ++-- configs/recognition/tanet/metafile.yml | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/configs/recognition/tanet/README.md b/configs/recognition/tanet/README.md index f0b77af549..1a67a40aa0 100644 --- a/configs/recognition/tanet/README.md +++ b/configs/recognition/tanet/README.md @@ -28,8 +28,8 @@ Video data is with complex temporal dynamics due to various factors such as came | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :--------: | :--: | :------: | :------: | :---------: | :---------: | :---------------: | :---: | :----: | :--------------------------------: | :------------------------------: | :-----------------------------: | -| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 46.13/49.00 | 75.06/77.22 | 16 clips x 3 crop | 43.1G | 25.1M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb_20220906-de50e4ef.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.log) | -| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 48.52/51.02 | 78.61/79.83 | 16 clips x 3 crop | 86.1G | 25.1M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb_20220919-cc37e9b8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 46.98/49.71 | 75.75/77.43 | 16 clips x 3 crop | 43.1G | 25.1M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb_20220906-de50e4ef.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 48.24/50.95 | 78.16/79.28 | 16 clips x 3 crop | 86.1G | 25.1M | [config](/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb_20220919-cc37e9b8.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The values in columns named after "reference" are the results got by testing on our dataset, using the checkpoints provided by the author with same model settings. The checkpoints for reference repo can be downloaded [here](https://drive.google.com/drive/folders/1sFfmP3yrfc7IzRshEELOby7-aEoymIFL?usp=sharing). diff --git a/configs/recognition/tanet/metafile.yml b/configs/recognition/tanet/metafile.yml index 12d0ffd1e2..7506a83417 100644 --- a/configs/recognition/tanet/metafile.yml +++ b/configs/recognition/tanet/metafile.yml @@ -47,10 +47,10 @@ Models: - Dataset: SthV1 Task: Action Recognition Metrics: - Top 1 Accuracy: 49.00 - Top 1 Accuracy (efficient): 46.13 - Top 5 Accuracy: 77.22 - Top 5 Accuracy (efficient): 75.06 + Top 1 Accuracy: 49.71 + Top 1 Accuracy (efficient): 46.98 + Top 5 Accuracy: 77.43 + Top 5 Accuracy (efficient): 75.75 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb_20220906-de50e4ef.pth @@ -72,9 +72,9 @@ Models: - Dataset: SthV1 Task: Action Recognition Metrics: - Top 1 Accuracy: 51.02 - Top 1 Accuracy (efficient): 48.52 - Top 5 Accuracy: 79.83 - Top 5 Accuracy (efficient): 78.61 + Top 1 Accuracy: 50.95 + Top 1 Accuracy (efficient): 48.24 + Top 5 Accuracy: 79.28 + Top 5 Accuracy (efficient): 78.16 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb_20220919-cc37e9b8.pth From 8c0d4474b929b8de27a9650ea7dcd708979f3513 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Mon, 19 Dec 2022 11:31:40 +0800 Subject: [PATCH 13/19] fix c2d --- ...d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py | 5 ----- 
.../c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py | 5 ----- .../c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py | 5 ----- 3 files changed, 15 deletions(-) diff --git a/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py b/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py index 13795ffc00..cf8ae40e40 100644 --- a/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py +++ b/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py @@ -10,12 +10,7 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), diff --git a/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py b/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py index 4247cd8d9c..515dd621ac 100644 --- a/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py +++ b/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-16x4x1-100e_kinetics400-rgb.py @@ -10,12 +10,7 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1), diff --git a/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py b/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py index b2ca2c707e..135907a8c9 100644 --- a/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py +++ b/configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_kinetics400-rgb.py @@ -10,12 +10,7 @@ ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), From de5fcbb56fe2f8df43fc55fe1f167b5bb80f5187 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Mon, 19 Dec 2022 11:32:11 +0800 Subject: [PATCH 14/19] fix c3d --- .../c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py b/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py index d89534ae9a..49635fa412 100644 --- a/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py +++ b/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py @@ -12,12 +12,7 @@ ann_file_val = f'data/ucf101/ucf101_val_split_{split}_videos.txt' ann_file_test = f'data/ucf101/ucf101_val_split_{split}_videos.txt' -# 
file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/ucf101': 's3://openmmlab/datasets/action/ucf101'})) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=16, frame_interval=1, num_clips=1), From 9febe9ea67e3649dd36f8adb1070a0197def594c Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Mon, 19 Dec 2022 11:32:51 +0800 Subject: [PATCH 15/19] fix i3d --- ...ned-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py | 4 ---- ...magenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py | 4 ---- ...t-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py | 4 ---- 3 files changed, 12 deletions(-) diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py index 9c835e34cd..5ec792b3dc 100644 --- a/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -21,10 +21,6 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py index 28f23521f1..63b14db296 100644 --- a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py @@ -11,10 +11,6 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), diff --git a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py index 57e592ca8d..a8593e221c 100644 --- a/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py +++ b/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-dense-32x2x1-100e_kinetics400-rgb.py @@ -8,10 +8,6 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), From ed806e443684d114358a50964ce0706e79032de7 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Mon, 19 Dec 2022 11:33:17 +0800 Subject: [PATCH 16/19] fix r2+1d --- .../r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py | 4 ---- 
.../r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py index 61ebe26cd9..ddd7b1c2f8 100644 --- a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py +++ b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py @@ -8,10 +8,6 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), diff --git a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py index 8aa68449d2..ab28168ab9 100644 --- a/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py +++ b/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_kinetics400-rgb.py @@ -10,10 +10,6 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), From e07f6119fa726953a8beec377ef6dbdf3ed91278 Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Mon, 19 Dec 2022 11:33:45 +0800 Subject: [PATCH 17/19] fix slowfast --- .../slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py b/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py index e6698ddb9c..7c3c0a66ef 100644 --- a/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py +++ b/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py @@ -9,10 +9,6 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='DecordInit', **file_client_args), From ff74080812b37655e7e9bdb76e6e740287d9392e Mon Sep 17 00:00:00 2001 From: Dai-Wenxun Date: Mon, 19 Dec 2022 11:34:26 +0800 Subject: [PATCH 18/19] fix swin --- ...-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 4 ---- ...244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py | 4 ---- ...p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 4 ---- ...-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 4 ---- ...-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 4 ---- 5 files changed, 20 deletions(-) diff --git a/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py index ebb304b6a0..1e9874d132 100644 --- 
From ff74080812b37655e7e9bdb76e6e740287d9392e Mon Sep 17 00:00:00 2001
From: Dai-Wenxun
Date: Mon, 19 Dec 2022 11:34:26 +0800
Subject: [PATCH 18/19] fix swin

---
 ...-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 4 ----
 ...244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py | 4 ----
 ...p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 4 ----
 ...-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 4 ----
 ...-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py | 4 ----
 5 files changed, 20 deletions(-)

diff --git a/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
index ebb304b6a0..1e9874d132 100644
--- a/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
+++ b/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
@@ -19,10 +19,6 @@
 ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
 ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
 
-# file_client_args = dict(
-#     io_backend='petrel',
-#     path_mapping=dict(
-#         {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'}))
 file_client_args = dict(io_backend='disk')
 train_pipeline = [
     dict(type='DecordInit', **file_client_args),
diff --git a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py
index 0fbbb465ec..b5c3e57150 100644
--- a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py
+++ b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_16xb8-amp-32x2x1-30e_kinetics700-rgb.py
@@ -11,10 +11,6 @@
 ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt'
 ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt'
 
-# file_client_args = dict(
-#     io_backend='petrel',
-#     path_mapping=dict(
-#         {'data/kinetics700': 's3://openmmlab/datasets/action/Kinetics700'}))
 file_client_args = dict(io_backend='disk')
 train_pipeline = [
     dict(type='DecordInit', **file_client_args),
diff --git a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
index 2696d18c9c..a1dffd1d06 100644
--- a/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
+++ b/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
@@ -19,10 +19,6 @@
 ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
 ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
 
-# file_client_args = dict(
-#     io_backend='petrel',
-#     path_mapping=dict(
-#         {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'}))
 file_client_args = dict(io_backend='disk')
 train_pipeline = [
     dict(type='DecordInit', **file_client_args),
diff --git a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
index 1d6312c224..09df2e6c4f 100644
--- a/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
+++ b/configs/recognition/swin/swin-small-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
@@ -18,10 +18,6 @@
 ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
 ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
 
-# file_client_args = dict(
-#     io_backend='petrel',
-#     path_mapping=dict(
-#         {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'}))
 file_client_args = dict(io_backend='disk')
 train_pipeline = [
     dict(type='DecordInit', **file_client_args),
diff --git a/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py b/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
index f44e1d5e72..e5b14fc8fc 100644
--- a/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
+++ b/configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py
@@ -16,10 +16,6 @@
 ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
 ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
 
-# file_client_args = dict(
-#     io_backend='petrel',
-#     path_mapping=dict(
-#         {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'}))
 file_client_args = dict(io_backend='disk')
 train_pipeline = [
     dict(type='DecordInit', **file_client_args),
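The swin file names above also encode the launch recipe: `8xb8-amp-32x2x1-30e` reads as 8 GPUs x batch size 8 per GPU, automatic mixed precision, 32x2x1 frame sampling, 30 epochs. In MMEngine-style configs AMP is usually enabled through the optimizer wrapper; a hedged sketch under that assumption, with illustrative optimizer settings that are not copied from these files:

    optim_wrapper = dict(
        type='AmpOptimWrapper',  # mixed-precision training ('amp' in the name)
        optimizer=dict(type='AdamW', lr=1e-3, weight_decay=0.05),  # illustrative values
        clip_grad=dict(max_norm=40),  # illustrative safeguard, not from this diff
    )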
From f874ec339a55e9278c98f02eb9a748dcebb33bf6 Mon Sep 17 00:00:00 2001
From: Dai-Wenxun
Date: Mon, 19 Dec 2022 11:34:52 +0800
Subject: [PATCH 19/19] fix tanet

---
 ...tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py | 4 ----
 .../tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py | 4 ----
 ...et-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py | 4 ----
 3 files changed, 12 deletions(-)

diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py
index ec3a6b6817..bad33feae1 100644
--- a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py
+++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb6-1x1x16-50e_sthv1-rgb.py
@@ -17,10 +17,6 @@
 ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt'
 sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52}
 
-# file_client_args = dict(
-#     io_backend='petrel',
-#     path_mapping=dict(
-#         {'data/sthv1': 's3://openmmlab/datasets/action/sthv1'}))
 file_client_args = dict(io_backend='disk')
 train_pipeline = [
     dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16),
diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py
index 5e309cdd15..2831be7524 100644
--- a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py
+++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-1x1x8-50e_sthv1-rgb.py
@@ -15,10 +15,6 @@
 ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt'
 sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52}
 
-# file_client_args = dict(
-#     io_backend='petrel',
-#     path_mapping=dict(
-#         {'data/sthv1': 's3://openmmlab/datasets/action/sthv1'}))
 file_client_args = dict(io_backend='disk')
 train_pipeline = [
     dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
diff --git a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py
index 312387535f..3f503090b8 100644
--- a/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py
+++ b/configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb8-dense-1x1x8-100e_kinetics400-rgb.py
@@ -11,10 +11,6 @@
 ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
 ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
 
-# file_client_args = dict(
-#     io_backend='petrel',
-#     path_mapping=dict(
-#         {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'}))
 file_client_args = dict(io_backend='disk')
 train_pipeline = [
     dict(type='DecordInit', **file_client_args),
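One detail worth flagging in the tanet sthv1 configs above: `sthv1_flip_label_map` exists because some Something-Something classes are direction-sensitive (a left-to-right action becomes its right-to-left counterpart under mirroring), so a horizontal flip must also swap the paired labels. A minimal sketch of the idea; `flip_with_label_map` is a hypothetical helper, while the real MMAction2 `Flip` transform operates on a results dict:

    import random
    import numpy as np

    sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52}

    def flip_with_label_map(frames, label, flip_label_map, p=0.5):
        """Horizontally flip a clip and remap direction-sensitive labels."""
        if random.random() < p:
            frames = [f[:, ::-1, :] for f in frames]  # mirror the width axis
            label = flip_label_map.get(label, label)  # swap paired classes
        return frames, label

    # With p=1.0 the clip is always flipped, so class 2 comes back as 4.
    clip = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(8)]
    clip, label = flip_with_label_map(clip, 2, sthv1_flip_label_map, p=1.0)
    assert label == 4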