diff --git a/slowfast/datasets/transform.py b/slowfast/datasets/transform.py index 0fe026753..b8f5a6c30 100644 --- a/slowfast/datasets/transform.py +++ b/slowfast/datasets/transform.py @@ -46,7 +46,7 @@ def random_short_side_scale_jitter( corresponding boxes. Args: images (tensor): images to perform scale jitter. Dimension is - `num frames` x `channel` x `height` x `width`. + `channel` x `num frames` x `height` x `width`. min_size (int): the minimal size to scale the frames. max_size (int): the maximal size to scale the frames. boxes (ndarray): optional. Corresponding boxes to images. @@ -56,7 +56,7 @@ def random_short_side_scale_jitter( scale. If False, take a uniform sample from [min_scale, max_scale]. Returns: (tensor): the scaled images with dimension of - `num frames` x `channel` x `new height` x `new width`. + `channel` x `num frames` x `new height` x `new width`. (ndarray or None): the scaled boxes with dimension of `num boxes` x 4. """ @@ -119,13 +119,13 @@ def random_crop(images, size, boxes=None): Perform random spatial crop on the given images and corresponding boxes. Args: images (tensor): images to perform random crop. The dimension is - `num frames` x `channel` x `height` x `width`. + `channel` x `num frames` x `height` x `width`. size (int): the size of height and width to crop on the image. boxes (ndarray or None): optional. Corresponding boxes to images. Dimension is `num boxes` x 4. Returns: cropped (tensor): cropped images with dimension of - `num frames` x `channel` x `size` x `size`. + `channel` x `num frames` x `size` x `size`. cropped_boxes (ndarray or None): the cropped boxes with dimension of `num boxes` x 4. """ @@ -156,12 +156,12 @@ def horizontal_flip(prob, images, boxes=None): Args: prob (float): probability to flip the images. images (tensor): images to perform horizontal flip, the dimension is - `num frames` x `channel` x `height` x `width`. + `channel` x `num frames` x `height` x `width`. boxes (ndarray or None): optional. 
Corresponding boxes to images. Dimension is `num boxes` x 4. Returns: images (tensor): images with dimension of - `num frames` x `channel` x `height` x `width`. + `channel` x `num frames` x `height` x `width`. flipped_boxes (ndarray or None): the flipped boxes with dimension of `num boxes` x 4. """ diff --git a/slowfast/datasets/utils.py b/slowfast/datasets/utils.py index 47a1b7f44..d9a7b75d2 100644 --- a/slowfast/datasets/utils.py +++ b/slowfast/datasets/utils.py @@ -125,7 +125,7 @@ def spatial_sampling( with the given spatial_idx. Args: frames (tensor): frames of images sampled from the video. The - dimension is `num frames` x `height` x `width` x `channel`. + dimension is `channel` x `num frames` x `height` x `width`. spatial_idx (int): if -1, perform random spatial sampling. If 0, 1, or 2, perform left, center, right crop if width is larger than height, and perform top, center, bottom crop if height is larger