From 74a66d86c1ca1028e7b3287bfaee8e36b26a9c6f Mon Sep 17 00:00:00 2001 From: KaiHoo Date: Thu, 8 Dec 2022 00:59:15 -0500 Subject: [PATCH 1/6] update README --- ...8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py | 2 +- configs/recognition/timesformer/README.md | 20 +++++++++---------- configs/recognition/timesformer/metafile.yml | 12 ++++++++--- ...aceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py | 18 ++++++++++++++--- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py b/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py index fa65298d8d..4d4a3dea6b 100644 --- a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py +++ b/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py @@ -49,7 +49,7 @@ # The testing is w/o. any cropping / flipping val_pipeline = [ dict( - type='SampleAVAFrames', clip_len=16, frame_interval=4, test_mode=True), + type='SampleAVAFrames', clip_len=16, frame_interval=4, test_mode=True), dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), diff --git a/configs/recognition/timesformer/README.md b/configs/recognition/timesformer/README.md index 8b3fdf2c30..c7f5f439db 100644 --- a/configs/recognition/timesformer/README.md +++ b/configs/recognition/timesformer/README.md @@ -20,19 +20,17 @@ We present a convolution-free approach to video classification built exclusively ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :------------: | :--: | :---------------------: | :----------: | :------: | :------: | :---------------------: | :--------: | :------------------------: | :-----------------------: | :----------------------: | -| 8x32x1 | short-side 320 | 8 | TimeSformer (divST) | ImageNet-21K | 77.96 | 93.57 | x | 15235 | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | -| 8x32x1 | short-side 320 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.93 | 93.27 | x | 33358 | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | -| 8x32x1 | short-side 320 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.98 | 92.83 | x | 12355 | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | - -1. The **gpus** indicates the number of gpu (80G A100) we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. - According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, - e.g., lr=0.005 for 8 GPUs x 8 videos/gpu and lr=0.00375 for 8 GPUs x 6 videos/gpu. +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------: | :----------: | :------: | :------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: | +| 8x32x1 | 224x224 | 8 | TimeSformer (divST) | ImageNet-21K | 77.69 | 93.45 | 588G | 122M | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.95 | 93.28 | 539G | 86.11M | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.93 | 92.88 | 422G | 86.11M | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. We keep the test setting with the [original repo](https://github.com/facebookresearch/TimeSformer) (three crop x 1 clip). 3. The pretrained model `vit_base_patch16_224.pth` used by TimeSformer was converted from [vision_transformer](https://github.com/google-research/vision_transformer). -For more details on data preparation, you can refer to the **Prepare videos** part in the [Data Preparation Tutorial](/docs/en/user_guides/2_data_prepare.md). +For more details on data preparation, you can refer to [Kinetics400](/tools/data/kinetics/README.md). ## Train @@ -46,7 +44,7 @@ Example: train TimeSformer model on Kinetics-400 dataset in a deterministic opti ```shell python tools/train.py configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True + --seed=0 --deterministic ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). diff --git a/configs/recognition/timesformer/metafile.yml b/configs/recognition/timesformer/metafile.yml index 7f7edd40eb..3226870a67 100644 --- a/configs/recognition/timesformer/metafile.yml +++ b/configs/recognition/timesformer/metafile.yml @@ -14,7 +14,9 @@ Models: Batch Size: 8 Epochs: 15 Pretrained: ImageNet-21K - Resolution: short-side 320 + Resolution: 224x224 + FLOPs: 588G + params: 122M Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -35,7 +37,9 @@ Models: Batch Size: 8 Epochs: 15 Pretrained: ImageNet-21K - Resolution: short-side 320 + Resolution: 224x224 + FLOPs: 539G + params: 86.11M Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB @@ -56,7 +60,9 @@ Models: Batch Size: 8 Epochs: 15 Pretrained: ImageNet-21K - Resolution: short-side 320 + Resolution: 224x224 + FLOPs: 422G + params: 86.11M Training Data: Kinetics-400 Training Resources: 8 GPUs Modality: RGB diff --git a/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py b/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py index b969a33d0e..2b1bc559e5 100644 --- a/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py +++ b/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py @@ -35,8 +35,14 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' +# file_client_args = dict( +# io_backend='petrel', +# path_mapping=dict( +# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) +file_client_args = dict(io_backend='disk') + train_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=8, frame_interval=32, num_clips=1), dict(type='DecordDecode'), dict(type='RandomRescale', scale_range=(256, 320)), @@ -46,7 +52,7 @@ dict(type='PackActionInputs') ] val_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=8, @@ -60,7 +66,7 @@ dict(type='PackActionInputs') ] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='SampleFrames', clip_len=8, @@ -136,3 +142,9 @@ ] default_hooks = dict(checkpoint=dict(interval=5)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) From 39815949635580e67f3272f07f8da39041eb1f91 Mon Sep 17 00:00:00 2001 From: KaiHoo Date: Thu, 8 Dec 2022 19:11:32 -0500 Subject: [PATCH 2/6] update README --- configs/recognition/timesformer/metafile.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/recognition/timesformer/metafile.yml b/configs/recognition/timesformer/metafile.yml index 3226870a67..83e4e64dfe 100644 --- a/configs/recognition/timesformer/metafile.yml +++ b/configs/recognition/timesformer/metafile.yml @@ -24,8 +24,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 77.96 - Top 5 Accuracy: 93.57 + Top 1 Accuracy: 77.69 + Top 5 Accuracy: 93.45 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth @@ -47,8 +47,8 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 76.93 - Top 5 Accuracy: 93.27 + Top 1 Accuracy: 76.95 + Top 5 Accuracy: 93.28 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth @@ -70,7 +70,7 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 76.98 - Top 5 Accuracy: 92.83 + Top 1 Accuracy: 76.93 + Top 5 Accuracy: 92.88 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth From 576623e60c57292a136f86d94d99596ff45426ea Mon Sep 17 00:00:00 2001 From: KaiHoo Date: Fri, 9 Dec 2022 00:44:55 -0500 Subject: [PATCH 3/6] update README --- configs/recognition/timesformer/README.md | 10 +++++----- configs/recognition/timesformer/metafile.yml | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/recognition/timesformer/README.md b/configs/recognition/timesformer/README.md index c7f5f439db..563feb0cb8 100644 --- a/configs/recognition/timesformer/README.md +++ b/configs/recognition/timesformer/README.md @@ -20,11 +20,11 @@ We present a convolution-free approach to video classification built exclusively ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | FLOPs | params | config | ckpt | log | -| :---------------------: | :--------: | :--: | :---------------------: | :----------: | :------: | :------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: | -| 8x32x1 | 224x224 | 8 | TimeSformer (divST) | ImageNet-21K | 77.69 | 93.45 | 588G | 122M | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | -| 8x32x1 | 224x224 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.95 | 93.28 | 539G | 86.11M | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | -| 8x32x1 | 224x224 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.93 | 92.88 | 422G | 86.11M | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------: | :----------: | :------: | :------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: |:---: |:---: | +| 8x32x1 | 224x224 | 8 | TimeSformer (divST) | ImageNet-21K | 77.69 | 93.45 | 1 clips x 3 crop |196G | 122M | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.95 | 93.28 |1 clips x 3 crop | 180G | 86.11M | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.93 | 92.88 | 1 clips x 3 crop |141G | 86.11M | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. We keep the test setting with the [original repo](https://github.com/facebookresearch/TimeSformer) (three crop x 1 clip). diff --git a/configs/recognition/timesformer/metafile.yml b/configs/recognition/timesformer/metafile.yml index 83e4e64dfe..f144b647e3 100644 --- a/configs/recognition/timesformer/metafile.yml +++ b/configs/recognition/timesformer/metafile.yml @@ -15,7 +15,7 @@ Models: Epochs: 15 Pretrained: ImageNet-21K Resolution: 224x224 - FLOPs: 588G + FLOPs: 196G params: 122M Training Data: Kinetics-400 Training Resources: 8 GPUs @@ -38,7 +38,7 @@ Models: Epochs: 15 Pretrained: ImageNet-21K Resolution: 224x224 - FLOPs: 539G + FLOPs: 180G params: 86.11M Training Data: Kinetics-400 Training Resources: 8 GPUs @@ -61,7 +61,7 @@ Models: Epochs: 15 Pretrained: ImageNet-21K Resolution: 224x224 - FLOPs: 422G + FLOPs: 141G params: 86.11M Training Data: Kinetics-400 Training Resources: 8 GPUs From 918d1acb901e9ab2ce138359d1bae04c7c75cb90 Mon Sep 17 00:00:00 2001 From: KaiHoo Date: Fri, 9 Dec 2022 00:47:39 -0500 Subject: [PATCH 4/6] update README --- configs/recognition/timesformer/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/recognition/timesformer/README.md b/configs/recognition/timesformer/README.md index 563feb0cb8..91de635b53 100644 --- a/configs/recognition/timesformer/README.md +++ b/configs/recognition/timesformer/README.md @@ -20,11 +20,11 @@ We present a convolution-free approach to video classification built exclusively ### Kinetics-400 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | -| :---------------------: | :--------: | :--: | :---------------------: | :----------: | :------: | :------: | :---: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: |:---: |:---: | -| 8x32x1 | 224x224 | 8 | TimeSformer (divST) | ImageNet-21K | 77.69 | 93.45 | 1 clips x 3 crop |196G | 122M | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | -| 8x32x1 | 224x224 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.95 | 93.28 |1 clips x 3 crop | 180G | 86.11M | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | -| 8x32x1 | 224x224 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.93 | 92.88 | 1 clips x 3 crop |141G | 86.11M | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :---------------------: | :----------: | :------: | :------: | :--------------: | :---: | :----: | :----------------------------: | :--------------------------: | :-------------------------: | +| 8x32x1 | 224x224 | 8 | TimeSformer (divST) | ImageNet-21K | 77.69 | 93.45 | 1 clips x 3 crop | 196G | 122M | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.95 | 93.28 | 1 clips x 3 crop | 180G | 86.11M | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.93 | 92.88 | 1 clips x 3 crop | 141G | 86.11M | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. We keep the test setting with the [original repo](https://github.com/facebookresearch/TimeSformer) (three crop x 1 clip). From c7fcf5a83d8becfb5e950e182e5d6e8ad20cae0b Mon Sep 17 00:00:00 2001 From: KaiHoo Date: Fri, 9 Dec 2022 00:48:18 -0500 Subject: [PATCH 5/6] update README --- configs/recognition/timesformer/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/recognition/timesformer/README.md b/configs/recognition/timesformer/README.md index 91de635b53..df197e0ba9 100644 --- a/configs/recognition/timesformer/README.md +++ b/configs/recognition/timesformer/README.md @@ -22,9 +22,9 @@ We present a convolution-free approach to video classification built exclusively | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :--------: | :--: | :---------------------: | :----------: | :------: | :------: | :--------------: | :---: | :----: | :----------------------------: | :--------------------------: | :-------------------------: | -| 8x32x1 | 224x224 | 8 | TimeSformer (divST) | ImageNet-21K | 77.69 | 93.45 | 1 clips x 3 crop | 196G | 122M | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | -| 8x32x1 | 224x224 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.95 | 93.28 | 1 clips x 3 crop | 180G | 86.11M | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | -| 8x32x1 | 224x224 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.93 | 92.88 | 1 clips x 3 crop | 141G | 86.11M | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (divST) | ImageNet-21K | 77.69 | 93.45 | 1 clip x 3 crop | 196G | 122M | [config](/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (jointST) | ImageNet-21K | 76.95 | 93.28 | 1 clip x 3 crop | 180G | 86.11M | [config](/configs/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-8022d1c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_jointST_8xb8-8x32x1-15e_kinetics400-rgb.log) | +| 8x32x1 | 224x224 | 8 | TimeSformer (spaceOnly) | ImageNet-21K | 76.93 | 92.88 | 1 clip x 3 crop | 141G | 86.11M | [config](/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb_20220815-78f05367.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. We keep the test setting with the [original repo](https://github.com/facebookresearch/TimeSformer) (three crop x 1 clip). From 28179461d740b5b6b27c9aeec5e6056f94a73c66 Mon Sep 17 00:00:00 2001 From: KaiHoo Date: Mon, 12 Dec 2022 19:38:53 -0500 Subject: [PATCH 6/6] rm file args --- .../timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py b/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py index 2b1bc559e5..e4379bee0c 100644 --- a/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py +++ b/configs/recognition/timesformer/timesformer_spaceOnly_8xb8-8x32x1-15e_kinetics400-rgb.py @@ -35,10 +35,6 @@ ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict( -# {'data/kinetics400': 's3://openmmlab/datasets/action/Kinetics400'})) file_client_args = dict(io_backend='disk') train_pipeline = [