
Commit 25257fa

Add initial structures folder for bounding boxes
1 parent 627dcfd · commit 25257fa

4 files changed: 194 additions & 6 deletions

torchvision/layers/nms.py

Lines changed: 2 additions & 2 deletions
@@ -2,5 +2,5 @@
 from torchvision import _C

 nms = _C.nms
-nms.__doc__ = """
-This function performs Non-maximum suppression"""
+# nms.__doc__ = """
+# This function performs Non-maximum suppression"""
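With the docstring commented out, a minimal usage sketch for orientation. This assumes the compiled _C.nms binding takes (boxes, scores, iou_threshold) with boxes in xyxy layout and returns the indices of the kept boxes, as the later torchvision.ops.nms does; the exact signature is not shown in this diff, and the import path is likewise an assumption.

    import torch
    from torchvision.layers.nms import nms  # assumed import path for this commit

    # Two heavily overlapping boxes plus one far away, in (xmin, ymin, xmax, ymax).
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],
                          [50., 50., 60., 60.]])
    scores = torch.tensor([0.9, 0.8, 0.7])

    # Assumed signature: nms(boxes, scores, iou_threshold) -> indices of kept boxes.
    keep = nms(boxes, scores, 0.5)
    print(keep)  # expected: boxes 0 and 2 kept; box 1 suppressed by box 0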

torchvision/structures/__init__.py

Whitespace-only changes.
torchvision/structures/bounding_box.py

Lines changed: 178 additions & 0 deletions

@@ -0,0 +1,178 @@
import torch


# transpose
FLIP_LEFT_RIGHT = 0
FLIP_TOP_BOTTOM = 1


class BBox(object):
    """
    This class represents a set of bounding boxes.
    The bounding boxes are represented as a Nx4 Tensor.
    In order to uniquely determine the bounding boxes with respect
    to an image, we also store the corresponding image dimensions.
    They can contain extra information that is specific to each bounding box, such as
    labels.
    """
    def __init__(self, bbox, image_size, mode='xyxy'):
        device = bbox.device if isinstance(bbox, torch.Tensor) else torch.device('cpu')
        bbox = torch.tensor(bbox, dtype=torch.float32, device=device)
        if bbox.ndimension() != 2:
            raise ValueError(
                "bbox should have 2 dimensions, got {}".format(bbox.ndimension()))
        if bbox.size(-1) != 4:
            raise ValueError(
                "last dimension of bbox should have a "
                "size of 4, got {}".format(bbox.size(-1)))
        if mode not in ('xyxy', 'xywh'):
            raise ValueError(
                "mode should be 'xyxy' or 'xywh'")

        self.bbox = bbox
        self.size = image_size  # (image_width, image_height)
        self.mode = mode
        self.extra_fields = {}

    def add_field(self, field, field_data):
        self.extra_fields[field] = field_data

    def get_field(self, field):
        return self.extra_fields[field]

    def fields(self):
        return list(self.extra_fields.keys())

    def _copy_extra_fields(self, bbox):
        for k, v in bbox.extra_fields.items():
            self.extra_fields[k] = v

    def convert(self, mode):
        if mode not in ('xyxy', 'xywh'):
            raise ValueError(
                "mode should be 'xyxy' or 'xywh'")
        if mode == self.mode:
            return self
        # we only have two modes, so don't need to check
        # self.mode
        xmin, ymin, xmax, ymax = self._split()
        if mode == 'xyxy':
            bbox = torch.cat(
                (xmin, ymin, xmax, ymax), dim=-1)
            bbox = BBox(bbox, self.size, mode=mode)
        else:
            bbox = torch.cat(
                (xmin, ymin, xmax - xmin, ymax - ymin), dim=-1)
            bbox = BBox(bbox, self.size, mode=mode)
        bbox._copy_extra_fields(self)
        return bbox

    def _split(self):
        if self.mode == 'xyxy':
            xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1)
            return xmin, ymin, xmax, ymax
        elif self.mode == 'xywh':
            xmin, ymin, w, h = self.bbox.split(1, dim=-1)
            return xmin, ymin, xmin + w, ymin + h
        else:
            raise RuntimeError('Should not be here')

    def resize(self, size, *args, **kwargs):
        """
        Returns a resized copy of this bounding box

        :param size: The requested size in pixels, as a 2-tuple:
            (width, height).
        """

        ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size))
        if ratios[0] == ratios[1]:
            ratio = ratios[0]
            scaled_box = self.bbox * ratio
            bbox = BBox(scaled_box, size, mode=self.mode)
            bbox._copy_extra_fields(self)
            return bbox

        ratio_width, ratio_height = ratios
        xmin, ymin, xmax, ymax = self._split()
        scaled_xmin = xmin * ratio_width
        scaled_xmax = xmax * ratio_width
        scaled_ymin = ymin * ratio_height
        scaled_ymax = ymax * ratio_height
        scaled_box = torch.cat(
            (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1)
        bbox = BBox(scaled_box, size, mode='xyxy')
        bbox._copy_extra_fields(self)
        return bbox.convert(self.mode)

    def transpose(self, method):
        """
        Transpose bounding box (flip or rotate in 90 degree steps)
        :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`,
          :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`,
          :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`,
          :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`.
        """
        if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM):
            raise NotImplementedError(
                "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented")
        image_width, image_height = self.size
        xmin, ymin, xmax, ymax = self._split()
        if method == FLIP_LEFT_RIGHT:
            transposed_xmin = image_width - xmax
            transposed_xmax = image_width - xmin
            transposed_ymin = ymin
            transposed_ymax = ymax
        elif method == FLIP_TOP_BOTTOM:
            transposed_xmin = xmin
            transposed_xmax = xmax
            transposed_ymin = image_height - ymax
            transposed_ymax = image_height - ymin

        transposed_boxes = torch.cat(
            (transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1)
        bbox = BBox(transposed_boxes, self.size, mode='xyxy')
        bbox._copy_extra_fields(self)
        return bbox.convert(self.mode)

    def crop(self, box):
        """
        Crops a rectangular region from this bounding box. The box is a
        4-tuple defining the left, upper, right, and lower pixel
        coordinate.
        """
        xmin, ymin, xmax, ymax = self._split()
        w, h = box[2] - box[0], box[3] - box[1]
        cropped_xmin = (xmin - box[0]).clamp(min=0, max=w)
        cropped_ymin = (ymin - box[1]).clamp(min=0, max=h)
        cropped_xmax = (xmax - box[0]).clamp(min=0, max=w)
        cropped_ymax = (ymax - box[1]).clamp(min=0, max=h)

        # TODO should I filter empty boxes here?
        if False:
            is_empty = (cropped_xmin == cropped_xmax) | (cropped_ymin == cropped_ymax)

        cropped_box = torch.cat(
            (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1)
        bbox = BBox(cropped_box, (w, h), mode='xyxy')
        bbox._copy_extra_fields(self)
        return bbox.convert(self.mode)

    def __repr__(self):
        s = self.__class__.__name__ + '('
        s += 'num_boxes={}, '.format(self.bbox.size(0))
        s += 'image_width={}, '.format(self.size[0])
        s += 'image_height={}, '.format(self.size[1])
        s += 'mode={})'.format(self.mode)
        return s


if __name__ == '__main__':
    bbox = BBox([[0, 0, 10, 10], [0, 0, 5, 5]], (10, 10))
    s_bbox = bbox.resize((5, 5))
    print(s_bbox)
    print(s_bbox.bbox)

    t_bbox = bbox.transpose(0)
    print(t_bbox)
    print(t_bbox.bbox)
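Beyond the built-in __main__ demo, a short sketch of the rest of the new container's API, exercising add_field(), convert(), and crop(); the box values are illustrative:

    from torchvision.structures.bounding_box import BBox

    bbox = BBox([[10, 10, 50, 50], [0, 0, 20, 20]], image_size=(100, 100))
    bbox.add_field('labels', [1, 2])

    # xyxy -> xywh: (xmin, ymin, xmax, ymax) becomes (xmin, ymin, width, height).
    xywh = bbox.convert('xywh')
    print(xywh.bbox)  # tensor([[10., 10., 40., 40.], [ 0.,  0., 20., 20.]])

    # Crop to the region (5, 5, 60, 60): coordinates are shifted by the crop
    # origin and clamped to the 55x55 crop, and extra fields carry over.
    cropped = bbox.crop((5, 5, 60, 60))
    print(cropped)                      # BBox(num_boxes=2, image_width=55, image_height=55, mode=xyxy)
    print(cropped.get_field('labels'))  # [1, 2]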

torchvision/transforms/functional.py

Lines changed: 14 additions & 4 deletions
@@ -13,6 +13,8 @@
 import collections
 import warnings

+from ..structures.bounding_box import BBox
+

 def _is_pil_image(img):
     if accimage is not None:
@@ -167,7 +169,7 @@ def normalize(tensor, mean, std):
     return tensor


-def resize(img, size, interpolation=Image.BILINEAR):
+def resize(img, size, interpolation=Image.BILINEAR, max_size=None):
     """Resize the input PIL Image to the given size.

     Args:
@@ -183,15 +185,23 @@ def resize(img, size, interpolation=Image.BILINEAR):
     Returns:
         PIL Image: Resized image.
     """
-    if not _is_pil_image(img):
+    if not (_is_pil_image(img) or isinstance(img, BBox)):
         raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
     if not (isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2)):
         raise TypeError('Got inappropriate size arg: {}'.format(size))

     if isinstance(size, int):
         w, h = img.size
+
+        if max_size is not None:
+            min_original_size = float(min((w, h)))
+            max_original_size = float(max((w, h)))
+            if max_original_size / min_original_size * size > max_size:
+                size = int(round(max_size * min_original_size / max_original_size))
+
         if (w <= h and w == size) or (h <= w and h == size):
             return img
+
         if w < h:
             ow = size
             oh = int(size * h / w)
@@ -291,7 +301,7 @@ def crop(img, i, j, h, w):
     Returns:
         PIL Image: Cropped image.
     """
-    if not _is_pil_image(img):
+    if not (_is_pil_image(img) or isinstance(img, BBox)):
         raise TypeError('img should be PIL Image. Got {}'.format(type(img)))

     return img.crop((j, i, j + w, i + h))
@@ -339,7 +349,7 @@ def hflip(img):
     Returns:
         PIL Image: Horizontally flipped image.
     """
-    if not _is_pil_image(img):
+    if not (_is_pil_image(img) or isinstance(img, BBox)):
         raise TypeError('img should be PIL Image. Got {}'.format(type(img)))

     return img.transpose(Image.FLIP_LEFT_RIGHT)
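To make the new max_size behavior in resize() concrete, here is the capping arithmetic from the hunk above worked through standalone (the numbers are illustrative):

    # With a 400x1000 image, a target shorter side of 800 would make the longer
    # side 1000 / 400 * 800 = 2000; with max_size=1333 the target is shrunk so
    # the longer side lands at (about) max_size instead.
    w, h = 400, 1000
    size, max_size = 800, 1333

    min_original_size = float(min((w, h)))  # 400.0
    max_original_size = float(max((w, h)))  # 1000.0
    if max_original_size / min_original_size * size > max_size:
        size = int(round(max_size * min_original_size / max_original_size))

    print(size)  # 533 -> the image would be resized to roughly 533 x 1332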
