Commit 276701f

Merge branch 'main' into canvas-size
2 parents: 893414b + bdf1622

9 files changed: +190 -24 lines changed

.git-blame-ignore-revs

Lines changed: 2 additions & 0 deletions
@@ -9,3 +9,5 @@ d367a01a18a3ae6bee13d8be3b63fd6a581ea46f
 6ca9c76adb6daf2695d603ad623a9cf1c4f4806f
 # Fix unnecessary exploded black formatting (#7709)
 a335d916db0694770e8152f41e19195de3134523
+# Renaming: `BoundingBox` -> `BoundingBoxes` (#7778)
+332bff937c6711666191880fab57fa2f23ae772e
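
For context: this file lists commits that `git blame` should skip (here, two mechanical formatting/rename commits). GitHub applies the file automatically when it sits at the repository root; locally the standard git invocations are

    git blame --ignore-revs-file .git-blame-ignore-revs <path>
    git config blame.ignoreRevsFile .git-blame-ignore-revs   # opt in once per clone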

docs/source/transforms.rst

Lines changed: 4 additions & 4 deletions
@@ -261,13 +261,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran
 AugMix
 v2.AugMix
 
-Cutmix - Mixup
+CutMix - MixUp
 --------------
 
-Cutmix and Mixup are special transforms that
+CutMix and MixUp are special transforms that
 are meant to be used on batches rather than on individual images, because they
-are combining pairs of images together. These can be used after the dataloader,
-or part of a collation function. See
+are combining pairs of images together. These can be used after the dataloader
+(once the samples are batched), or part of a collation function. See
 :ref:`sphx_glr_auto_examples_plot_cutmix_mixup.py` for detailed usage examples.
 
 .. autosummary::

gallery/plot_cutmix_mixup.py

Lines changed: 146 additions & 2 deletions
@@ -1,8 +1,152 @@
 
 """
 ===========================
-How to use Cutmix and Mixup
+How to use CutMix and MixUp
 ===========================
 
-TODO
+:class:`~torchvision.transforms.v2.Cutmix` and
+:class:`~torchvision.transforms.v2.Mixup` are popular augmentation strategies
+that can improve classification accuracy.
+
+These transforms are slightly different from the rest of the Torchvision
+transforms, because they expect
+**batches** of samples as input, not individual images. In this example we'll
+explain how to use them: after the ``DataLoader``, or as part of a collation
+function.
 """
+
+# %%
+import torch
+import torchvision
+from torchvision.datasets import FakeData
+
+# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that
+# some APIs may slightly change in the future
+torchvision.disable_beta_transforms_warning()
+
+from torchvision.transforms import v2
+
+
+NUM_CLASSES = 100
+
+# %%
+# Pre-processing pipeline
+# -----------------------
+#
+# We'll use a simple but typical image classification pipeline:
+
+preproc = v2.Compose([
+    v2.PILToTensor(),
+    v2.RandomResizedCrop(size=(224, 224), antialias=True),
+    v2.RandomHorizontalFlip(p=0.5),
+    v2.ToDtype(torch.float32, scale=True),  # to float32 in [0, 1]
+    v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),  # typically from ImageNet
+])
+
+dataset = FakeData(size=1000, num_classes=NUM_CLASSES, transform=preproc)
+
+img, label = dataset[0]
+print(f"{type(img) = }, {img.dtype = }, {img.shape = }, {label = }")
+
+# %%
+#
+# One important thing to note is that neither CutMix nor MixUp are part of this
+# pre-processing pipeline. We'll add them a bit later once we define the
+# DataLoader. Just as a refresher, this is what the DataLoader and training loop
+# would look like if we weren't using CutMix or MixUp:
+
+from torch.utils.data import DataLoader
+
+dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
+
+for images, labels in dataloader:
+    print(f"{images.shape = }, {labels.shape = }")
+    print(labels.dtype)
+    # <rest of the training loop here>
+    break
+# %%
+
+# %%
+# Where to use MixUp and CutMix
+# -----------------------------
+#
+# After the DataLoader
+# ^^^^^^^^^^^^^^^^^^^^
+#
+# Now let's add CutMix and MixUp. The simplest way to do this is right after the
+# DataLoader: the DataLoader has already batched the images and labels for us,
+# and this is exactly what these transforms expect as input:
+
+dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
+
+cutmix = v2.Cutmix(num_classes=NUM_CLASSES)
+mixup = v2.Mixup(num_classes=NUM_CLASSES)
+cutmix_or_mixup = v2.RandomChoice([cutmix, mixup])
+
+for images, labels in dataloader:
+    print(f"Before CutMix/MixUp: {images.shape = }, {labels.shape = }")
+    images, labels = cutmix_or_mixup(images, labels)
+    print(f"After CutMix/MixUp: {images.shape = }, {labels.shape = }")
+
+    # <rest of the training loop here>
+    break
+# %%
+#
+# Note how the labels were also transformed: we went from a batched label of
+# shape (batch_size,) to a tensor of shape (batch_size, num_classes). The
+# transformed labels can still be passed as-is to a loss function like
+# :func:`torch.nn.functional.cross_entropy`.
+#
+# As part of the collation function
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Passing the transforms after the DataLoader is the simplest way to use CutMix
+# and MixUp, but one disadvantage is that it does not take advantage of the
+# DataLoader multi-processing. For that, we can pass those transforms as part of
+# the collation function (refer to the `PyTorch docs
+# <https://pytorch.org/docs/stable/data.html#dataloader-collate-fn>`_ to learn
+# more about collation).
+
+from torch.utils.data import default_collate
+
+
+def collate_fn(batch):
+    return cutmix_or_mixup(*default_collate(batch))
+
+
+dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn)
+
+for images, labels in dataloader:
+    print(f"{images.shape = }, {labels.shape = }")
+    # No need to call cutmix_or_mixup, it's already been called as part of the DataLoader!
+    # <rest of the training loop here>
+    break
+
+# %%
+# Non-standard input format
+# -------------------------
+#
+# So far we've used a typical sample structure where we pass ``(images,
+# labels)`` as inputs. MixUp and CutMix will magically work by default with most
+# common sample structures: tuples where the second parameter is a tensor label,
+# or dicts with a "label[s]" key. Look at the documentation of the
+# ``labels_getter`` parameter for more details.
+#
+# If your samples have a different structure, you can still use CutMix and MixUp
+# by passing a callable to the ``labels_getter`` parameter. For example:
+
+batch = {
+    "imgs": torch.rand(4, 3, 224, 224),
+    "target": {
+        "classes": torch.randint(0, NUM_CLASSES, size=(4,)),
+        "some_other_key": "this is going to be passed-through"
+    }
+}
+
+
+def labels_getter(batch):
+    return batch["target"]["classes"]
+
+
+out = v2.Cutmix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch)
+print(f"{out['imgs'].shape = }, {out['target']['classes'].shape = }")

references/detection/coco_utils.py

Lines changed: 1 addition & 3 deletions
@@ -178,9 +178,7 @@ def get_coco_api_from_dataset(dataset):
             break
         if isinstance(dataset, torch.utils.data.Subset):
             dataset = dataset.dataset
-    if isinstance(dataset, torchvision.datasets.CocoDetection) or isinstance(
-        getattr(dataset, "_dataset", None), torchvision.datasets.CocoDetection
-    ):
+    if isinstance(dataset, torchvision.datasets.CocoDetection):
         return dataset.coco
     return convert_to_coco_api(dataset)
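
Note on this change (and the identical one in group_by_aspect_ratio.py just below): the getattr(dataset, "_dataset", ...) fallback existed for the old v2 wrapper object. After the _dataset_wrapper.py change in this commit, the wrapper is a dynamically created subclass of the wrapped dataset's class, so a plain isinstance check covers both cases. An illustrative sketch, assuming a local COCO layout (the paths are placeholders):

    import torchvision
    from torchvision.datasets import wrap_dataset_for_transforms_v2

    dataset = torchvision.datasets.CocoDetection("path/to/images", "path/to/annotations.json")
    wrapped = wrap_dataset_for_transforms_v2(dataset)

    # Both now pass, so the extra getattr(..., "_dataset") branch is unnecessary.
    assert isinstance(dataset, torchvision.datasets.CocoDetection)
    assert isinstance(wrapped, torchvision.datasets.CocoDetection)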

references/detection/group_by_aspect_ratio.py

Lines changed: 1 addition & 3 deletions
@@ -164,9 +164,7 @@ def compute_aspect_ratios(dataset, indices=None):
     if hasattr(dataset, "get_height_and_width"):
         return _compute_aspect_ratios_custom_dataset(dataset, indices)
 
-    if isinstance(dataset, torchvision.datasets.CocoDetection) or isinstance(
-        getattr(dataset, "_dataset", None), torchvision.datasets.CocoDetection
-    ):
+    if isinstance(dataset, torchvision.datasets.CocoDetection):
         return _compute_aspect_ratios_coco_dataset(dataset, indices)
 
     if isinstance(dataset, torchvision.datasets.VOCDetection):

test/datasets_utils.py

Lines changed: 4 additions & 2 deletions
@@ -571,7 +571,7 @@ def test_transforms_v2_wrapper(self, config):
         from torchvision.datasets import wrap_dataset_for_transforms_v2
 
         try:
-            with self.create_dataset(config) as (dataset, _):
+            with self.create_dataset(config) as (dataset, info):
                 for target_keys in [None, "all"]:
                     if target_keys is not None and self.DATASET_CLASS not in {
                         torchvision.datasets.CocoDetection,
@@ -584,8 +584,10 @@ def test_transforms_v2_wrapper(self, config):
                         continue
 
                     wrapped_dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys)
-                    wrapped_sample = wrapped_dataset[0]
+                    assert isinstance(wrapped_dataset, self.DATASET_CLASS)
+                    assert len(wrapped_dataset) == info["num_examples"]
 
+                    wrapped_sample = wrapped_dataset[0]
                     assert tree_any(lambda item: isinstance(item, (Datapoint, PIL.Image.Image)), wrapped_sample)
         except TypeError as error:
             msg = f"No wrapper exists for dataset class {type(dataset).__name__}"

test/test_transforms_v2_refactored.py

Lines changed: 1 addition & 1 deletion
@@ -1922,7 +1922,7 @@ def test_supported_input_structure(self, T):
 
         dataset = self.DummyDataset(size=batch_size, num_classes=num_classes)
 
-        cutmix_mixup = T(alpha=0.5, num_classes=num_classes)
+        cutmix_mixup = T(num_classes=num_classes)
 
         dl = DataLoader(dataset, batch_size=batch_size)
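
For context on the ``alpha`` argument this test no longer pins: it parameterizes the Beta(alpha, alpha) distribution from which the transforms draw the mixing coefficient lam, and the constructor now defaults it to 1.0 (see the _augment.py change below). A quick illustrative check, not part of the commit:

    import torch

    for alpha in (0.5, 1.0, 2.0):
        lam = torch.distributions.Beta(alpha, alpha).sample((5,))
        # alpha < 1 pushes lam toward 0 or 1 (mild mixing), alpha = 1 is
        # uniform on [0, 1], alpha > 1 concentrates lam around 0.5
        print(f"{alpha=}: {lam.tolist()}")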

torchvision/datapoints/_dataset_wrapper.py

Lines changed: 11 additions & 3 deletions
@@ -8,7 +8,6 @@
 from collections import defaultdict
 
 import torch
-from torch.utils.data import Dataset
 
 from torchvision import datapoints, datasets
 from torchvision.transforms.v2 import functional as F
@@ -98,7 +97,16 @@ def wrap_dataset_for_transforms_v2(dataset, target_keys=None):
             f"but got {target_keys}"
         )
 
-    return VisionDatasetDatapointWrapper(dataset, target_keys)
+    # Imagine we have isinstance(dataset, datasets.ImageNet). This will create a new class with the name
+    # "WrappedImageNet" at runtime that doubly inherits from VisionDatasetDatapointWrapper (see below) as well as the
+    # original ImageNet class. This allows the user to do regular isinstance(wrapped_dataset, datasets.ImageNet) checks,
+    # while we can still inject everything that we need.
+    wrapped_dataset_cls = type(f"Wrapped{type(dataset).__name__}", (VisionDatasetDatapointWrapper, type(dataset)), {})
+    # Since VisionDatasetDatapointWrapper comes before ImageNet in the MRO, calling the class hits
+    # VisionDatasetDatapointWrapper.__init__ first. Since we are never doing super().__init__(...), the constructor of
+    # ImageNet is never hit. That is by design, since we don't want to create the dataset instance again, but rather
+    # have the existing instance as attribute on the new object.
+    return wrapped_dataset_cls(dataset, target_keys)
 
 
 class WrapperFactories(dict):
@@ -117,7 +125,7 @@ def decorator(wrapper_factory):
 WRAPPER_FACTORIES = WrapperFactories()
 
 
-class VisionDatasetDatapointWrapper(Dataset):
+class VisionDatasetDatapointWrapper:
     def __init__(self, dataset, target_keys):
         dataset_cls = type(dataset)
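
The three-argument type() call above is plain Python; a self-contained toy sketch (not torchvision code) shows why the isinstance check works while the wrapped dataset's constructor is bypassed:

    class Wrapper:
        # Stands in for VisionDatasetDatapointWrapper: first in the MRO, so its
        # __init__ runs, and it never calls super().__init__().
        def __init__(self, dataset):
            self._dataset = dataset

        def __getitem__(self, idx):
            return f"wrapped({self._dataset[idx]})"


    class ToyDataset:
        def __init__(self, size):
            self.size = size

        def __getitem__(self, idx):
            return idx


    dataset = ToyDataset(size=3)
    WrappedToyDataset = type("WrappedToyDataset", (Wrapper, ToyDataset), {})
    wrapped = WrappedToyDataset(dataset)  # only Wrapper.__init__ runs

    assert isinstance(wrapped, ToyDataset)  # the point of the change
    assert wrapped[0] == "wrapped(0)"       # behavior comes from Wrapper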

torchvision/transforms/v2/_augment.py

Lines changed: 20 additions & 6 deletions
@@ -141,9 +141,9 @@ def _transform(
 
 
 class _BaseMixupCutmix(Transform):
-    def __init__(self, *, alpha: float = 1, num_classes: int, labels_getter="default") -> None:
+    def __init__(self, *, alpha: float = 1.0, num_classes: int, labels_getter="default") -> None:
         super().__init__()
-        self.alpha = alpha
+        self.alpha = float(alpha)
         self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha]))
 
         self.num_classes = num_classes
@@ -204,13 +204,20 @@ def _mixup_label(self, label: torch.Tensor, *, lam: float) -> torch.Tensor:
 
 
 class Mixup(_BaseMixupCutmix):
-    """[BETA] Apply Mixup to the provided batch of images and labels.
+    """[BETA] Apply MixUp to the provided batch of images and labels.
 
     .. v2betastatus:: Mixup transform
 
     Paper: `mixup: Beyond Empirical Risk Minimization <https://arxiv.org/abs/1710.09412>`_.
 
-    See :ref:`sphx_glr_auto_examples_plot_cutmix_mixup.py` for detailed usage examples.
+    .. note::
+        This transform is meant to be used on **batches** of samples, not
+        individual images. See
+        :ref:`sphx_glr_auto_examples_plot_cutmix_mixup.py` for detailed usage
+        examples.
+        The sample pairing is deterministic and done by matching consecutive
+        samples in the batch, so the batch needs to be shuffled (this is an
+        implementation detail, not a guaranteed convention.)
 
     In the input, the labels are expected to be a tensor of shape ``(batch_size,)``. They will be transformed
     into a tensor of shape ``(batch_size, num_classes)``.
@@ -246,14 +253,21 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 
 
 class Cutmix(_BaseMixupCutmix):
-    """[BETA] Apply Cutmix to the provided batch of images and labels.
+    """[BETA] Apply CutMix to the provided batch of images and labels.
 
     .. v2betastatus:: Cutmix transform
 
     Paper: `CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features
     <https://arxiv.org/abs/1905.04899>`_.
 
-    See :ref:`sphx_glr_auto_examples_plot_cutmix_mixup.py` for detailed usage examples.
+    .. note::
+        This transform is meant to be used on **batches** of samples, not
+        individual images. See
+        :ref:`sphx_glr_auto_examples_plot_cutmix_mixup.py` for detailed usage
+        examples.
+        The sample pairing is deterministic and done by matching consecutive
+        samples in the batch, so the batch needs to be shuffled (this is an
+        implementation detail, not a guaranteed convention.)
 
     In the input, the labels are expected to be a tensor of shape ``(batch_size,)``. They will be transformed
     into a tensor of shape ``(batch_size, num_classes)``.
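
To make the docstrings' note about pairing consecutive samples concrete, here is a minimal sketch of the MixUp idea only (illustrative, not torchvision's exact implementation; it assumes the batch-roll pairing the note describes):

    import torch

    alpha = 1.0
    batch_size, num_classes = 4, 10
    images = torch.rand(batch_size, 3, 8, 8)
    labels = torch.randint(0, num_classes, size=(batch_size,))

    # lam ~ Beta(alpha, alpha) controls how strongly each pair is blended
    lam = torch.distributions.Beta(alpha, alpha).sample().item()

    # pair each sample with the next one by rolling the batch dimension
    mixed_images = lam * images + (1 - lam) * images.roll(1, 0)

    one_hot = torch.nn.functional.one_hot(labels, num_classes).float()
    mixed_labels = lam * one_hot + (1 - lam) * one_hot.roll(1, 0)

    print(mixed_images.shape, mixed_labels.shape)  # (4, 3, 8, 8) and (4, 10)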
