diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst
new file mode 100644
index 00000000000..07e20b090e6
--- /dev/null
+++ b/docs/source/datapoints.rst
@@ -0,0 +1,13 @@
+Datapoints
+==========
+
+.. currentmodule:: torchvision.datapoints
+.. autosummary::
+    :toctree: generated/
+    :template: class.rst
+
+    Image
+    Video
+    BoundingBoxFormat
+    BoundingBox
+    Mask
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 79dbebdd047..ac047ff5869 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -31,6 +31,7 @@ architectures, and common image transformations for computer vision.
    :maxdepth: 2
    :caption: Package Reference
 
+   datapoints
    transforms
    models
    datasets
diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py
index 75e779f0b21..d8441823c3e 100644
--- a/torchvision/datapoints/_bounding_box.py
+++ b/torchvision/datapoints/_bounding_box.py
@@ -10,12 +10,35 @@
 
 
 class BoundingBoxFormat(Enum):
+    """[BETA] Coordinate format of a bounding box.
+
+    Available formats are
+
+    * ``XYXY``
+    * ``XYWH``
+    * ``CXCYWH``
+    """
+
     XYXY = "XYXY"
     XYWH = "XYWH"
     CXCYWH = "CXCYWH"
 
 
 class BoundingBox(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for bounding boxes.
+
+    Args:
+        data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
+        format (BoundingBoxFormat, str): Format of the bounding box.
+        spatial_size (two-tuple of ints): Height and width of the corresponding image or video.
+        dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
     format: BoundingBoxFormat
     spatial_size: Tuple[int, int]
 
@@ -52,6 +75,20 @@ def wrap_like(
         format: Optional[BoundingBoxFormat] = None,
         spatial_size: Optional[Tuple[int, int]] = None,
     ) -> BoundingBox:
+        """Wrap a :class:`torch.Tensor` as :class:`BoundingBox` from a reference.
+
+        Args:
+            other (BoundingBox): Reference bounding box.
+            tensor (Tensor): Tensor to be wrapped as :class:`BoundingBox`.
+            format (BoundingBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the
+                reference.
+            spatial_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If
+                omitted, it is taken from the reference.
+
+        """
+        if isinstance(format, str):
+            format = BoundingBoxFormat.from_str(format.upper())
+
         return cls._wrap(
             tensor,
             format=format if format is not None else other.format,
diff --git a/torchvision/datapoints/_image.py b/torchvision/datapoints/_image.py
index 21dfe5a5cd6..e47a6c10fc3 100644
--- a/torchvision/datapoints/_image.py
+++ b/torchvision/datapoints/_image.py
@@ -10,6 +10,19 @@
 
 
 class Image(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for images.
+
+    Args:
+        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
+            well as PIL images.
+        dtype (torch.dtype, optional): Desired data type of the image. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the image. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the image is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the image. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
     @classmethod
     def _wrap(cls, tensor: torch.Tensor) -> Image:
         image = tensor.as_subclass(cls)
diff --git a/torchvision/datapoints/_mask.py b/torchvision/datapoints/_mask.py
index bb70ec12224..0135d793d32 100644
--- a/torchvision/datapoints/_mask.py
+++ b/torchvision/datapoints/_mask.py
@@ -10,6 +10,19 @@
 
 
 class Mask(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for segmentation and detection masks.
+
+    Args:
+        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
+            well as PIL images.
+        dtype (torch.dtype, optional): Desired data type of the mask. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the mask. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the mask is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the mask. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
     @classmethod
     def _wrap(cls, tensor: torch.Tensor) -> Mask:
         return tensor.as_subclass(cls)
diff --git a/torchvision/datapoints/_video.py b/torchvision/datapoints/_video.py
index ab51c10233d..a6fbe2bd473 100644
--- a/torchvision/datapoints/_video.py
+++ b/torchvision/datapoints/_video.py
@@ -9,6 +9,18 @@
 
 
 class Video(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for videos.
+
+    Args:
+        data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`.
+        dtype (torch.dtype, optional): Desired data type of the video. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the video. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the video is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the video. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
     @classmethod
     def _wrap(cls, tensor: torch.Tensor) -> Video:
         video = tensor.as_subclass(cls)
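For context on how the documented classes fit together, here is a minimal usage sketch based only on the constructors and :meth:`BoundingBox.wrap_like` signatures described in the docstrings above. It assumes the beta ``torchvision.datapoints`` namespace from this change; the variable names, tensor shapes, and coordinate values are illustrative placeholders.

```python
import torch
from torchvision import datapoints

# Wrap a plain CHW tensor as an Image; dtype and device are inferred from ``data``.
image = datapoints.Image(torch.rand(3, 224, 224))

# Bounding boxes carry their coordinate format and the height/width of the
# image or video they refer to.
boxes = datapoints.BoundingBox(
    [[10, 20, 40, 60], [15, 5, 80, 70]],
    format=datapoints.BoundingBoxFormat.XYXY,
    spatial_size=(224, 224),
)

# A segmentation mask is just another tensor subclass.
mask = datapoints.Mask(torch.zeros(224, 224, dtype=torch.uint8))

# ``wrap_like`` re-wraps a plain tensor as a BoundingBox, reusing the metadata
# (format, spatial_size) of the reference box unless overridden.
new_coords = torch.tensor([[12, 22, 42, 62], [17, 7, 82, 72]])
rewrapped = datapoints.BoundingBox.wrap_like(boxes, new_coords)

print(type(image).__name__, type(mask).__name__, type(rewrapped).__name__)
print(rewrapped.format, rewrapped.spatial_size)
```

Per the :meth:`wrap_like` docstring, ``format`` and ``spatial_size`` default to the reference box's values, which covers the common case of re-wrapping a plain :class:`torch.Tensor` result; the added ``from_str`` conversion also lets ``format`` be passed as a string.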