Remove non-functional Transforms from presets #4952

Merged
merged 2 commits on Nov 17, 2021
6 changes: 3 additions & 3 deletions torchvision/prototype/models/video/resnet.py
@@ -62,7 +62,7 @@ def _video_resnet(
class R3D_18Weights(Weights):
Kinetics400_RefV1 = WeightEntry(
url="https://download.pytorch.org/models/r3d_18-b3b3357e.pth",
- transforms=partial(Kinect400Eval, resize_size=(128, 171), crop_size=(112, 112)),
+ transforms=partial(Kinect400Eval, crop_size=(112, 112), resize_size=(128, 171)),
Contributor Author: Changing order to match the other presets.

Collaborator: This is just for appearance, right?

Contributor Author: yes, just for styling. :)

meta={
**_COMMON_META,
"acc@1": 52.75,
@@ -74,7 +74,7 @@ class R3D_18Weights(Weights):
class MC3_18Weights(Weights):
Kinetics400_RefV1 = WeightEntry(
url="https://download.pytorch.org/models/mc3_18-a90a0ba3.pth",
- transforms=partial(Kinect400Eval, resize_size=(128, 171), crop_size=(112, 112)),
+ transforms=partial(Kinect400Eval, crop_size=(112, 112), resize_size=(128, 171)),
meta={
**_COMMON_META,
"acc@1": 53.90,
@@ -86,7 +86,7 @@ class MC3_18Weights(Weights):
class R2Plus1D_18Weights(Weights):
Kinetics400_RefV1 = WeightEntry(
url="https://download.pytorch.org/models/r2plus1d_18-91a641e6.pth",
- transforms=partial(Kinect400Eval, resize_size=(128, 171), crop_size=(112, 112)),
+ transforms=partial(Kinect400Eval, crop_size=(112, 112), resize_size=(128, 171)),
meta={
**_COMMON_META,
"acc@1": 57.50,
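For context on the argument-order change discussed above, here is a minimal sketch (not part of this PR; make_preset is a hypothetical stand-in) showing that the order of keyword arguments bound with functools.partial has no effect on the resulting call:

```python
from functools import partial

def make_preset(crop_size, resize_size):
    # Hypothetical stand-in for the real preset constructor.
    return {"crop_size": crop_size, "resize_size": resize_size}

p1 = partial(make_preset, resize_size=(128, 171), crop_size=(112, 112))
p2 = partial(make_preset, crop_size=(112, 112), resize_size=(128, 171))
assert p1() == p2()  # identical results; the reordering is purely cosmetic
```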
44 changes: 24 additions & 20 deletions torchvision/prototype/transforms/_presets.py
@@ -3,8 +3,7 @@
import torch
from torch import Tensor, nn

- from ... import transforms as T
- from ...transforms import functional as F
+ from ...transforms import functional as F, InterpolationMode


__all__ = ["CocoEval", "ImageNetEval", "Kinect400Eval", "VocEval"]
@@ -26,42 +25,47 @@ def __init__(
resize_size: int = 256,
mean: Tuple[float, ...] = (0.485, 0.456, 0.406),
std: Tuple[float, ...] = (0.229, 0.224, 0.225),
- interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR,
+ interpolation: InterpolationMode = InterpolationMode.BILINEAR,
) -> None:
super().__init__()
- self._resize = T.Resize(resize_size, interpolation=interpolation)
- self._crop = T.CenterCrop(crop_size)
- self._normalize = T.Normalize(mean=mean, std=std)
+ self._crop_size = [crop_size]
+ self._size = [resize_size]
+ self._mean = list(mean)
+ self._std = list(std)
+ self._interpolation = interpolation

def forward(self, img: Tensor) -> Tensor:
- img = self._crop(self._resize(img))
+ img = F.resize(img, self._size, interpolation=self._interpolation)
+ img = F.center_crop(img, self._crop_size)
if not isinstance(img, Tensor):
img = F.pil_to_tensor(img)
img = F.convert_image_dtype(img, torch.float)
- return self._normalize(img)
+ img = F.normalize(img, mean=self._mean, std=self._std)
+ return img
Contributor Author: Replacing with the functional equivalents.

Contributor Author: It's equivalent. Confirmed by checking the accuracy of models before and after.
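A minimal sketch (illustrative, not from this PR) of the equivalence being relied on here: the transform classes dispatch to the same functional ops, so swapping T.Resize/T.CenterCrop/T.Normalize for F.resize/F.center_crop/F.normalize leaves the output unchanged.

```python
import torch
from torchvision import transforms as T
from torchvision.transforms import functional as F

img = torch.rand(3, 300, 400)

# Class-based pipeline (what the preset built before this PR).
out_cls = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(
    T.CenterCrop(224)(T.Resize(256)(img))
)

# Functional pipeline (what the preset calls after this PR).
out_fn = F.normalize(
    F.center_crop(F.resize(img, [256]), [224]),
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

assert torch.equal(out_cls, out_fn)  # same underlying ops, same result
```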



class Kinect400Eval(nn.Module):
def __init__(
self,
- resize_size: Tuple[int, int],
crop_size: Tuple[int, int],
+ resize_size: Tuple[int, int],
Collaborator: Unrelated to this PR, but why do we use Tuple[int, int] here for the sizes, but plain ints for ImageNetEval?

Contributor Author: This is a good point. This is how the original recipes were implemented, so I need to do the same here. Note that resize operates differently when you specify both dimensions versus only one. Before merging we should consider adding unions etc. here and cleaning up further.

mean: Tuple[float, ...] = (0.43216, 0.394666, 0.37645),
std: Tuple[float, ...] = (0.22803, 0.22145, 0.216989),
- interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR,
+ interpolation: InterpolationMode = InterpolationMode.BILINEAR,
) -> None:
super().__init__()
- self._convert = T.ConvertImageDtype(torch.float)
- self._resize = T.Resize(resize_size, interpolation=interpolation)
- self._normalize = T.Normalize(mean=mean, std=std)
- self._crop = T.CenterCrop(crop_size)
+ self._crop_size = list(crop_size)
+ self._size = list(resize_size)
+ self._mean = list(mean)
+ self._std = list(std)
+ self._interpolation = interpolation
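On the Tuple[int, int] vs plain int question raised above, a small sketch (illustrative only) of how F.resize treats the two forms differently: a single value resizes the smaller edge and preserves the aspect ratio, while an (h, w) pair forces that exact output size.

```python
import torch
from torchvision.transforms import functional as F

img = torch.zeros(3, 300, 400)  # (C, H, W)

a = F.resize(img, [256])        # single value: smaller edge -> 256, aspect ratio preserved
b = F.resize(img, [256, 256])   # pair: exact 256x256, aspect ratio not preserved

print(a.shape)  # torch.Size([3, 256, 341])
print(b.shape)  # torch.Size([3, 256, 256])
```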

def forward(self, vid: Tensor) -> Tensor:
vid = vid.permute(0, 3, 1, 2) # (T, H, W, C) => (T, C, H, W)
- vid = self._convert(vid)
- vid = self._resize(vid)
- vid = self._normalize(vid)
- vid = self._crop(vid)
+ vid = F.resize(vid, self._size, interpolation=self._interpolation)
+ vid = F.center_crop(vid, self._crop_size)
+ vid = F.convert_image_dtype(vid, torch.float)
+ vid = F.normalize(vid, mean=self._mean, std=self._std)
Contributor Author: I reordered the operations; this is the more "usual/canonical" order of ops. I'm running tests to confirm the accuracy remains the same.

Contributor Author: The accuracy before and after the change remains the same. The above change is safe.

return vid.permute(1, 0, 2, 3) # (T, C, H, W) => (C, T, H, W)
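For reference, a minimal sketch (not from the PR; the clip shape is made up) of the reordered functional pipeline that Kinect400Eval now applies to a video clip.

```python
import torch
from torchvision.transforms import functional as F

clip = torch.randint(0, 256, (16, 112, 158, 3), dtype=torch.uint8)  # fake (T, H, W, C) clip

vid = clip.permute(0, 3, 1, 2)                 # (T, H, W, C) -> (T, C, H, W)
vid = F.resize(vid, [128, 171])                # resize first ...
vid = F.center_crop(vid, [112, 112])           # ... then crop ...
vid = F.convert_image_dtype(vid, torch.float)  # ... then convert to float ...
vid = F.normalize(
    vid,
    mean=[0.43216, 0.394666, 0.37645],
    std=[0.22803, 0.22145, 0.216989],
)                                              # ... then normalize
out = vid.permute(1, 0, 2, 3)                  # (T, C, H, W) -> (C, T, H, W)
print(out.shape)                               # torch.Size([3, 16, 112, 112])
```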


@@ -71,8 +75,8 @@ def __init__(
resize_size: int,
mean: Tuple[float, ...] = (0.485, 0.456, 0.406),
std: Tuple[float, ...] = (0.229, 0.224, 0.225),
- interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR,
- interpolation_target: T.InterpolationMode = T.InterpolationMode.NEAREST,
+ interpolation: InterpolationMode = InterpolationMode.BILINEAR,
+ interpolation_target: InterpolationMode = InterpolationMode.NEAREST,
) -> None:
super().__init__()
self._size = [resize_size]