Commit 433283e

Specify coordinate constraints for box parameters (#3425)
* Specify coordinate constraints
* some more
* flake8
1 parent e1c49fa commit 433283e

10 files changed, +59 -44 lines changed

torchvision/models/detection/faster_rcnn.py

Lines changed: 8 additions & 8 deletions
@@ -32,8 +32,8 @@ class FasterRCNN(GeneralizedRCNN):
 
     During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the class label for each ground-truth box
 
     The model returns a Dict[Tensor] during training, containing the classification and regression

@@ -42,8 +42,8 @@ class FasterRCNN(GeneralizedRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores or each prediction

@@ -309,8 +309,8 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
     During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
 
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
 
     The model returns a ``Dict[Tensor]`` during training, containing the classification and regression

@@ -320,8 +320,8 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
 
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores or each prediction

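The practical upshot of the new wording: every training box must satisfy ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H`` for its image. A minimal sketch of a valid target (the image size, box values, and ``num_classes`` below are made up for illustration and are not part of this commit):

import torch
import torchvision

# Hypothetical example: one 3 x 300 x 400 image (H=300, W=400) with a single
# ground-truth box whose coordinates satisfy 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=2)
model.train()

images = [torch.rand(3, 300, 400)]
targets = [{
    "boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]),   # [x1, y1, x2, y2]
    "labels": torch.tensor([1], dtype=torch.int64),
}]

# During training the model returns a dict of classification and regression losses.
loss_dict = model(images, targets)
print(sorted(loss_dict.keys()))
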
torchvision/models/detection/keypoint_rcnn.py

Lines changed: 8 additions & 8 deletions
@@ -27,8 +27,8 @@ class KeypointRCNN(FasterRCNN):
     During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
 
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the class label for each ground-truth box
         - keypoints (FloatTensor[N, K, 3]): the K keypoints location for each of the N instances, in the
           format [x, y, visibility], where visibility=0 means that the keypoint is not visible.

@@ -40,8 +40,8 @@ class KeypointRCNN(FasterRCNN):
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
 
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores or each prediction
         - keypoints (FloatTensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format.

@@ -286,8 +286,8 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
     During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
 
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
         - keypoints (``FloatTensor[N, K, 3]``): the ``K`` keypoints location for each of the ``N`` instances, in the
          format ``[x, y, visibility]``, where ``visibility=0`` means that the keypoint is not visible.

@@ -299,8 +299,8 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
 
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores or each prediction
         - keypoints (``FloatTensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format.

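During inference the same constraint describes the returned ``boxes`` field. A short eval-mode sketch (hypothetical input sizes; the untrained model is used purely to show the output structure):

import torch
import torchvision

# Hypothetical inference sketch: predicted boxes are documented to be in
# [x1, y1, x2, y2] format with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=False)
model.eval()

images = [torch.rand(3, 300, 400)]
with torch.no_grad():
    predictions = model(images)

# One dict per input image with boxes, labels, scores and keypoints fields.
print(sorted(predictions[0].keys()))
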
torchvision/models/detection/mask_rcnn.py

Lines changed: 8 additions & 8 deletions
@@ -26,8 +26,8 @@ class MaskRCNN(FasterRCNN):
 
     During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the class label for each ground-truth box
         - masks (UInt8Tensor[N, H, W]): the segmentation binary masks for each instance
 

@@ -37,8 +37,8 @@ class MaskRCNN(FasterRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values of x
-          between 0 and W and values of y between 0 and H
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores or each prediction
         - masks (UInt8Tensor[N, 1, H, W]): the predicted masks for each instance, in 0-1 range. In order to

@@ -279,8 +279,8 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
     During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
 
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
         - masks (``UInt8Tensor[N, H, W]``): the segmentation binary masks for each instance
 

@@ -291,8 +291,8 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
 
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
-          between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores or each prediction
         - masks (``UInt8Tensor[N, 1, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to

torchvision/models/detection/retinanet.py

Lines changed: 8 additions & 8 deletions
@@ -236,8 +236,8 @@ class RetinaNet(nn.Module):
 
     During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values
-          between 0 and H and 0 and W
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the class label for each ground-truth box
 
     The model returns a Dict[Tensor] during training, containing the classification and regression

@@ -246,8 +246,8 @@ class RetinaNet(nn.Module):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between
-          0 and H and 0 and W
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores for each prediction
 

@@ -576,8 +576,8 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
     During training, the model expects both the input tensors, as well as a targets (list of dictionary),
     containing:
 
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values
-          between ``0`` and ``H`` and ``0`` and ``W``
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
 
     The model returns a ``Dict[Tensor]`` during training, containing the classification and regression

@@ -587,8 +587,8 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
 
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values between
-          ``0`` and ``H`` and ``0`` and ``W``
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores or each prediction
 

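All four detection models above now state the same contract, so a caller can validate target boxes once before training. A hypothetical helper (the function below is illustrative and not part of torchvision):

import torch

def check_boxes(boxes: torch.Tensor, height: int, width: int) -> None:
    # Hypothetical validator: raise if any [x1, y1, x2, y2] box violates
    # 0 <= x1 < x2 <= W or 0 <= y1 < y2 <= H.
    x1, y1, x2, y2 = boxes.unbind(dim=1)
    if not bool(((x1 >= 0) & (x1 < x2) & (x2 <= width)).all()):
        raise ValueError("x coordinates must satisfy 0 <= x1 < x2 <= W")
    if not bool(((y1 >= 0) & (y1 < y2) & (y2 <= height)).all()):
        raise ValueError("y coordinates must satisfy 0 <= y1 < y2 <= H")

# One valid box and one degenerate box (x1 == x2) for a 300 x 400 image.
check_boxes(torch.tensor([[10.0, 20.0, 110.0, 220.0]]), height=300, width=400)
try:
    check_boxes(torch.tensor([[50.0, 60.0, 50.0, 80.0]]), height=300, width=400)
except ValueError as err:
    print(err)
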
torchvision/ops/boxes.py

Lines changed: 14 additions & 7 deletions
@@ -22,7 +22,8 @@ def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
 
     Args:
         boxes (Tensor[N, 4])): boxes to perform NMS on. They
-            are expected to be in (x1, y1, x2, y2) format
+            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
+            ``0 <= y1 < y2``.
         scores (Tensor[N]): scores for each one of the boxes
         iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
 

@@ -50,7 +51,8 @@ def batched_nms(
 
     Args:
         boxes (Tensor[N, 4]): boxes where NMS will be performed. They
-            are expected to be in (x1, y1, x2, y2) format
+            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
+            ``0 <= y1 < y2``.
         scores (Tensor[N]): scores for each one of the boxes
         idxs (Tensor[N]): indices of the categories for each one of the boxes.
         iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

@@ -79,7 +81,8 @@ def remove_small_boxes(boxes: Tensor, min_size: float) -> Tensor:
     Remove boxes which contains at least one side smaller than min_size.
 
     Args:
-        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
+        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
+            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
         min_size (float): minimum size
 
     Returns:

@@ -97,7 +100,8 @@ def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
     Clip boxes so that they lie inside an image of size `size`.
 
     Args:
-        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
+        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
+            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
         size (Tuple[height, width]): size of the image
 
     Returns:

@@ -185,7 +189,8 @@ def box_area(boxes: Tensor) -> Tensor:
 
     Args:
         boxes (Tensor[N, 4]): boxes for which the area will be computed. They
-            are expected to be in (x1, y1, x2, y2) format
+            are expected to be in (x1, y1, x2, y2) format with
+            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
 
     Returns:
         area (Tensor[N]): area for each box

@@ -215,7 +220,8 @@ def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
     """
     Return intersection-over-union (Jaccard index) of boxes.
 
-    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
 
     Args:
         boxes1 (Tensor[N, 4])

@@ -234,7 +240,8 @@ def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
     """
     Return generalized intersection-over-union (Jaccard index) of boxes.
 
-    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
+    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
 
     Args:
         boxes1 (Tensor[N, 4])

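The box ops share the same expectation, minus the upper bound tied to an image size. A short sketch of ``nms`` and ``box_iou`` with made-up boxes that satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``:

import torch
from torchvision.ops import nms, box_iou

# Two heavily overlapping boxes and one disjoint box, all well-formed.
boxes = torch.tensor([
    [10.0, 10.0, 100.0, 100.0],
    [12.0, 12.0, 102.0, 102.0],
    [200.0, 200.0, 250.0, 260.0],
])
scores = torch.tensor([0.9, 0.8, 0.7])

keep = nms(boxes, scores, iou_threshold=0.5)   # indices of the boxes kept by NMS
iou = box_iou(boxes, boxes)                    # pairwise IoU matrix of shape [3, 3]
print(keep, iou.shape)
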
torchvision/ops/poolers.py

Lines changed: 1 addition & 1 deletion
@@ -204,7 +204,7 @@ def forward(
                 all the same number of channels, but they can have different sizes.
             boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
                 (x1, y1, x2, y2) format and in the image reference size, not the feature map
-                reference.
+                reference. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
             image_shapes (List[Tuple[height, width]]): the sizes of each image before they
                 have been fed to a CNN to obtain feature maps. This allows us to infer the
                 scale factor for each one of the levels to be pooled.

torchvision/ops/ps_roi_align.py

Lines changed: 3 additions & 1 deletion
@@ -21,7 +21,9 @@ def ps_roi_align(
     Args:
         input (Tensor[N, C, H, W]): input tensor
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
-            format where the regions will be taken from. If a single Tensor is passed,
+            format where the regions will be taken from.
+            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch

torchvision/ops/ps_roi_pool.py

Lines changed: 3 additions & 1 deletion
@@ -20,7 +20,9 @@ def ps_roi_pool(
     Args:
         input (Tensor[N, C, H, W]): input tensor
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
-            format where the regions will be taken from. If a single Tensor is passed,
+            format where the regions will be taken from.
+            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch

torchvision/ops/roi_align.py

Lines changed: 3 additions & 1 deletion
@@ -22,7 +22,9 @@ def roi_align(
     Args:
         input (Tensor[N, C, H, W]): input tensor
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
-            format where the regions will be taken from. If a single Tensor is passed,
+            format where the regions will be taken from.
+            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch

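``roi_align``, like ``ps_roi_align``, ``ps_roi_pool``, and ``roi_pool`` below, accepts boxes either as a single ``Tensor[K, 5]`` with the batch index in the first column or as a list of per-image ``Tensor[L, 4]``. A minimal ``roi_align`` sketch with made-up sizes, again keeping ``0 <= x1 < x2`` and ``0 <= y1 < y2``:

import torch
from torchvision.ops import roi_align

# One feature map of shape [1, 8, 32, 32] and two regions for batch element 0.
features = torch.rand(1, 8, 32, 32)
boxes = torch.tensor([
    # [batch_index, x1, y1, x2, y2] with 0 <= x1 < x2 and 0 <= y1 < y2
    [0.0, 2.0, 3.0, 20.0, 25.0],
    [0.0, 10.0, 5.0, 30.0, 18.0],
])

pooled = roi_align(features, boxes, output_size=(7, 7), spatial_scale=1.0)
print(pooled.shape)  # torch.Size([2, 8, 7, 7])
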
torchvision/ops/roi_pool.py

Lines changed: 3 additions & 1 deletion
@@ -20,7 +20,9 @@ def roi_pool(
     Args:
         input (Tensor[N, C, H, W]): input tensor
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
-            format where the regions will be taken from. If a single Tensor is passed,
+            format where the regions will be taken from.
+            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
+            If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch

0 commit comments
