From 685e16eb0aebad690e540a7e07013462d78f1707 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 11 Jul 2023 15:19:27 +0200 Subject: [PATCH 1/2] remove spatial_size --- gallery/plot_datapoints.py | 8 +- gallery/plot_transforms_v2.py | 2 +- test/common_utils.py | 60 ++----- test/test_datapoints.py | 4 +- test/test_prototype_transforms.py | 4 +- test/test_transforms_v2.py | 158 ++++++++---------- test/test_transforms_v2_consistency.py | 10 +- test/test_transforms_v2_functional.py | 80 ++++----- test/test_transforms_v2_refactored.py | 54 +++--- test/test_transforms_v2_utils.py | 8 +- test/transforms_v2_kernel_infos.py | 64 +++---- torchvision/datapoints/_bounding_box.py | 62 +++---- torchvision/datapoints/_datapoint.py | 2 +- torchvision/datapoints/_dataset_wrapper.py | 20 +-- torchvision/datapoints/_image.py | 10 +- torchvision/datapoints/_mask.py | 6 +- torchvision/datapoints/_video.py | 14 +- torchvision/prototype/transforms/_augment.py | 4 +- torchvision/prototype/transforms/_geometry.py | 10 +- torchvision/transforms/v2/_auto_augment.py | 10 +- torchvision/transforms/v2/_geometry.py | 20 +-- torchvision/transforms/v2/_meta.py | 2 +- torchvision/transforms/v2/_misc.py | 2 +- .../transforms/v2/functional/__init__.py | 12 +- .../transforms/v2/functional/_deprecated.py | 2 +- .../transforms/v2/functional/_geometry.py | 90 +++++----- torchvision/transforms/v2/functional/_meta.py | 118 +++++++------ torchvision/transforms/v2/utils.py | 21 ++- 28 files changed, 403 insertions(+), 454 deletions(-) diff --git a/gallery/plot_datapoints.py b/gallery/plot_datapoints.py index 5094de13a3e..52a44515b58 100644 --- a/gallery/plot_datapoints.py +++ b/gallery/plot_datapoints.py @@ -76,11 +76,11 @@ ######################################################################################################################## # In general, the datapoints can also store additional metadata that complements the underlying tensor. 
For example, -# :class:`~torchvision.datapoints.BoundingBox` stores the coordinate format as well as the spatial size of the +# :class:`~torchvision.datapoints.BoundingBox` stores the coordinate format as well as the canvas size of the # corresponding image alongside the actual values: bounding_box = datapoints.BoundingBox( - [17, 16, 344, 495], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] + [17, 16, 344, 495], format=datapoints.BoundingBoxFormat.XYXY, canvas_size=F.get_size(image) ) print(bounding_box) @@ -108,7 +108,7 @@ def __getitem__(self, item): target["boxes"] = datapoints.BoundingBox( boxes, format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=F.get_spatial_size(img), + canvas_size=F.get_size(img), ) target["labels"] = labels target["masks"] = datapoints.Mask(masks) @@ -129,7 +129,7 @@ def __call__(self, img, target): target["boxes"] = datapoints.BoundingBox( target["boxes"], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=F.get_spatial_size(img), + canvas_size=F.get_size(img), ) target["masks"] = datapoints.Mask(target["masks"]) return img, target diff --git a/gallery/plot_transforms_v2.py b/gallery/plot_transforms_v2.py index d1096bec1e7..daf78b77ad9 100644 --- a/gallery/plot_transforms_v2.py +++ b/gallery/plot_transforms_v2.py @@ -30,7 +30,7 @@ def load_data(): masks = datapoints.Mask(merged_masks == labels.view(-1, 1, 1)) bounding_boxes = datapoints.BoundingBox( - masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] + masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, canvas_size=image.shape[-2:] ) return path, image, bounding_boxes, masks, labels diff --git a/test/common_utils.py b/test/common_utils.py index 72ecf104301..021f6e28518 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -470,9 +470,10 @@ class ImageLoader(TensorLoader): spatial_size: Tuple[int, int] = dataclasses.field(init=False) num_channels: int = dataclasses.field(init=False) memory_format: torch.memory_format = torch.contiguous_format + canvas_size: Tuple[int, int] = dataclasses.field(init=False) def __post_init__(self): - self.spatial_size = self.shape[-2:] + self.spatial_size = self.canvas_size = self.shape[-2:] self.num_channels = self.shape[-3] def load(self, device): @@ -622,43 +623,20 @@ def make_image_loaders_for_interpolation( class BoundingBoxLoader(TensorLoader): format: datapoints.BoundingBoxFormat spatial_size: Tuple[int, int] + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.canvas_size = self.spatial_size def make_bounding_box( - size=None, + canvas_size=DEFAULT_SIZE, *, format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=None, batch_dims=(), dtype=None, device="cpu", ): - """ - size: Size of the actual bounding box, i.e. - - (box[3] - box[1], box[2] - box[0]) for XYXY - - (H, W) for XYWH and CXCYWH - spatial_size: Size of the reference object, e.g. an image. Corresponds to the .spatial_size attribute on - returned datapoints.BoundingBox - - To generate a valid joint sample, you need to set spatial_size here to the same value as size on the other maker - functions, e.g. - - .. code:: - - image = make_image=(size=size) - bounding_box = make_bounding_box(spatial_size=size) - assert F.get_spatial_size(bounding_box) == F.get_spatial_size(image) - - For convenience, if both size and spatial_size are omitted, spatial_size defaults to the same value as size for all - other maker functions, e.g. - - .. 
code:: - - image = make_image=() - bounding_box = make_bounding_box() - assert F.get_spatial_size(bounding_box) == F.get_spatial_size(image) - """ - def sample_position(values, max_value): # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. # However, if we have batch_dims, we need tensors as limits. @@ -667,28 +645,16 @@ def sample_position(values, max_value): if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] - if spatial_size is None: - if size is None: - spatial_size = DEFAULT_SIZE - else: - height, width = size - height_margin, width_margin = torch.randint(10, (2,)).tolist() - spatial_size = (height + height_margin, width + width_margin) - dtype = dtype or torch.float32 if any(dim == 0 for dim in batch_dims): return datapoints.BoundingBox( - torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size + torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, canvas_size=canvas_size ) - if size is None: - h, w = [torch.randint(1, s, batch_dims) for s in spatial_size] - else: - h, w = [torch.full(batch_dims, s, dtype=torch.int) for s in size] - - y = sample_position(h, spatial_size[0]) - x = sample_position(w, spatial_size[1]) + h, w = [torch.randint(1, c, batch_dims) for c in canvas_size] + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) if format is datapoints.BoundingBoxFormat.XYWH: parts = (x, y, w, h) @@ -705,7 +671,7 @@ def sample_position(values, max_value): raise ValueError(f"Format {format} is not supported") return datapoints.BoundingBox( - torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size ) @@ -721,7 +687,7 @@ def fn(shape, dtype, device): raise pytest.UsageError() return make_bounding_box( - format=format, spatial_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device + format=format, canvas_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device ) return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size) diff --git a/test/test_datapoints.py b/test/test_datapoints.py index 1334fd7283b..8024ab3d2ef 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -27,7 +27,7 @@ def test_mask_instance(data): "format", ["XYXY", "CXCYWH", datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH] ) def test_bbox_instance(data, format): - bboxes = datapoints.BoundingBox(data, format=format, spatial_size=(32, 32)) + bboxes = datapoints.BoundingBox(data, format=format, canvas_size=(32, 32)) assert isinstance(bboxes, torch.Tensor) assert bboxes.ndim == 2 and bboxes.shape[1] == 4 if isinstance(format, str): @@ -164,7 +164,7 @@ def test_wrap_like(): [ datapoints.Image(torch.rand(3, 16, 16)), datapoints.Video(torch.rand(2, 3, 16, 16)), - datapoints.BoundingBox([0.0, 1.0, 2.0, 3.0], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(10, 10)), + datapoints.BoundingBox([0.0, 1.0, 2.0, 3.0], format=datapoints.BoundingBoxFormat.XYXY, canvas_size=(10, 10)), datapoints.Mask(torch.randint(0, 256, (16, 16), dtype=torch.uint8)), ], ) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index c574979e22c..192d5ebacd0 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -164,7 +164,7 @@ def test__copy_paste(self, 
label_type): labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { "boxes": BoundingBox( - torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 32) + torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", canvas_size=(32, 32) ), "masks": Mask(masks), "labels": label_type(labels), @@ -179,7 +179,7 @@ def test__copy_paste(self, label_type): paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { "boxes": BoundingBox( - torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", spatial_size=(32, 32) + torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", canvas_size=(32, 32) ), "masks": Mask(paste_masks), "labels": label_type(paste_labels), diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 3743581794f..0fb775188cb 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -175,20 +175,20 @@ class TestSmoke: ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_common(self, transform, adapter, container_type, image_or_video, device): - spatial_size = F.get_spatial_size(image_or_video) + size = F.get_size(image_or_video) input = dict( image_or_video=image_or_video, - image_datapoint=make_image(size=spatial_size), - video_datapoint=make_video(size=spatial_size), - image_pil=next(make_pil_images(sizes=[spatial_size], color_spaces=["RGB"])), + image_datapoint=make_image(size=size), + video_datapoint=make_video(size=size), + image_pil=next(make_pil_images(sizes=[size], color_spaces=["RGB"])), bounding_box_xyxy=make_bounding_box( - format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(3,) + format=datapoints.BoundingBoxFormat.XYXY, canvas_size=size, batch_dims=(3,) ), bounding_box_xywh=make_bounding_box( - format=datapoints.BoundingBoxFormat.XYWH, spatial_size=spatial_size, batch_dims=(4,) + format=datapoints.BoundingBoxFormat.XYWH, canvas_size=size, batch_dims=(4,) ), bounding_box_cxcywh=make_bounding_box( - format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, batch_dims=(5,) + format=datapoints.BoundingBoxFormat.CXCYWH, canvas_size=size, batch_dims=(5,) ), bounding_box_degenerate_xyxy=datapoints.BoundingBox( [ @@ -200,7 +200,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [2, 2, 1, 1], # x1 > x2, y1 > y2 ], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=spatial_size, + canvas_size=size, ), bounding_box_degenerate_xywh=datapoints.BoundingBox( [ @@ -212,7 +212,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [0, 0, -1, -1], # negative height and width ], format=datapoints.BoundingBoxFormat.XYWH, - spatial_size=spatial_size, + canvas_size=size, ), bounding_box_degenerate_cxcywh=datapoints.BoundingBox( [ @@ -224,10 +224,10 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [0, 0, -1, -1], # negative height and width ], format=datapoints.BoundingBoxFormat.CXCYWH, - spatial_size=spatial_size, + canvas_size=size, ), - detection_mask=make_detection_mask(size=spatial_size), - segmentation_mask=make_segmentation_mask(size=spatial_size), + detection_mask=make_detection_mask(size=size), + segmentation_mask=make_segmentation_mask(size=size), int=0, float=0.0, bool=True, @@ -272,7 +272,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device # TODO: we should test that against all degenerate boxes above for 
format in list(datapoints.BoundingBoxFormat): sample = dict( - boxes=datapoints.BoundingBox([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)), + boxes=datapoints.BoundingBox([[0, 0, 0, 0]], format=format, canvas_size=(224, 244)), labels=torch.tensor([3]), ) assert transforms.SanitizeBoundingBox()(sample)["boxes"].shape == (0, 4) @@ -474,11 +474,11 @@ def test_assertions(self): @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__get_params(self, fill, side_range, mocker): + def test__get_params(self, fill, side_range): transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) - image = mocker.MagicMock(spec=datapoints.Image) - h, w = image.spatial_size = (24, 32) + h, w = size = (24, 32) + image = make_image(size) params = transform._get_params([image]) @@ -491,9 +491,7 @@ def test__get_params(self, fill, side_range, mocker): @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) def test__transform(self, fill, side_range, mocker): - inpt = mocker.MagicMock(spec=datapoints.Image) - inpt.num_channels = 3 - inpt.spatial_size = (24, 32) + inpt = make_image((24, 32)) transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) @@ -560,11 +558,9 @@ def test_assertions(self): @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) @pytest.mark.parametrize("size, pad_if_needed", [((10, 10), False), ((50, 25), True)]) - def test__get_params(self, padding, pad_if_needed, size, mocker): - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) - h, w = image.spatial_size + def test__get_params(self, padding, pad_if_needed, size): + h, w = size = (24, 32) + image = make_image(size) transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) params = transform._get_params([image]) @@ -614,21 +610,16 @@ def test__transform(self, padding, pad_if_needed, fill, padding_mode, mocker): output_size, padding=padding, pad_if_needed=pad_if_needed, fill=fill, padding_mode=padding_mode ) - inpt = mocker.MagicMock(spec=datapoints.Image) - inpt.num_channels = 3 - inpt.spatial_size = (32, 32) + h, w = size = (32, 32) + inpt = make_image(size) - expected = mocker.MagicMock(spec=datapoints.Image) - expected.num_channels = 3 if isinstance(padding, int): - expected.spatial_size = (inpt.spatial_size[0] + padding, inpt.spatial_size[1] + padding) + new_size = (h + padding, w + padding) elif isinstance(padding, list): - expected.spatial_size = ( - inpt.spatial_size[0] + sum(padding[0::2]), - inpt.spatial_size[1] + sum(padding[1::2]), - ) + new_size = (h + sum(padding[0::2]), w + sum(padding[1::2])) else: - expected.spatial_size = inpt.spatial_size + new_size = size + expected = make_image(new_size) _ = mocker.patch("torchvision.transforms.v2.functional.pad", return_value=expected) fn_crop = mocker.patch("torchvision.transforms.v2.functional.crop") @@ -704,7 +695,7 @@ def test__transform(self, kernel_size, sigma, mocker): fn = mocker.patch("torchvision.transforms.v2.functional.gaussian_blur") inpt = mocker.MagicMock(spec=datapoints.Image) inpt.num_channels = 3 - inpt.spatial_size = (24, 32) + inpt.canvas_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users @@ -750,16 +741,14 @@ def test_assertions(self): with pytest.raises(TypeError, match="Got inappropriate fill arg"): 
transforms.RandomPerspective(0.5, fill="abc") - def test__get_params(self, mocker): + def test__get_params(self): dscale = 0.5 transform = transforms.RandomPerspective(dscale) - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + + image = make_image((24, 32)) params = transform._get_params([image]) - h, w = image.spatial_size assert "coefficients" in params assert len(params["coefficients"]) == 8 @@ -770,9 +759,9 @@ def test__transform(self, distortion_scale, mocker): transform = transforms.RandomPerspective(distortion_scale, fill=fill, interpolation=interpolation) fn = mocker.patch("torchvision.transforms.v2.functional.perspective") - inpt = mocker.MagicMock(spec=datapoints.Image) - inpt.num_channels = 3 - inpt.spatial_size = (24, 32) + + inpt = make_image((24, 32)) + # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users # Otherwise, we can mock transform._get_params @@ -810,17 +799,16 @@ def test_assertions(self): with pytest.raises(TypeError, match="Got inappropriate fill arg"): transforms.ElasticTransform(1.0, 2.0, fill="abc") - def test__get_params(self, mocker): + def test__get_params(self): alpha = 2.0 sigma = 3.0 transform = transforms.ElasticTransform(alpha, sigma) - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + + h, w = size = (24, 32) + image = make_image(size) params = transform._get_params([image]) - h, w = image.spatial_size displacement = params["displacement"] assert displacement.shape == (1, h, w, 2) assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() @@ -846,7 +834,7 @@ def test__transform(self, alpha, sigma, mocker): fn = mocker.patch("torchvision.transforms.v2.functional.elastic") inpt = mocker.MagicMock(spec=datapoints.Image) inpt.num_channels = 3 - inpt.spatial_size = (24, 32) + inpt.canvas_size = (24, 32) # Let's mock transform._get_params to control the output: transform._get_params = mocker.MagicMock() @@ -857,7 +845,7 @@ def test__transform(self, alpha, sigma, mocker): class TestRandomErasing: - def test_assertions(self, mocker): + def test_assertions(self): with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): transforms.RandomErasing(value={}) @@ -873,9 +861,7 @@ def test_assertions(self, mocker): with pytest.raises(ValueError, match="Scale should be between 0 and 1"): transforms.RandomErasing(scale=[-1, 2]) - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + image = make_image((24, 32)) transform = transforms.RandomErasing(value=[1, 2, 3, 4]) @@ -883,10 +869,9 @@ def test_assertions(self, mocker): transform._get_params([image]) @pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"]) - def test__get_params(self, value, mocker): - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + def test__get_params(self, value): + image = make_image((24, 32)) + num_channels, height, width = F.get_dimensions(image) transform = transforms.RandomErasing(value=value) params = transform._get_params([image]) @@ -896,14 +881,14 @@ def test__get_params(self, value, mocker): i, j = params["i"], params["j"] assert isinstance(v, torch.Tensor) if value == "random": - assert v.shape == (image.num_channels, h, w) + assert v.shape == (num_channels, h, w) elif isinstance(value, (int, float)): assert v.shape == 
(1, 1, 1) elif isinstance(value, (list, tuple)): - assert v.shape == (image.num_channels, 1, 1) + assert v.shape == (num_channels, 1, 1) - assert 0 <= i <= image.spatial_size[0] - h - assert 0 <= j <= image.spatial_size[1] - w + assert 0 <= i <= height - h + assert 0 <= j <= width - w @pytest.mark.parametrize("p", [0, 1]) def test__transform(self, mocker, p): @@ -1062,14 +1047,13 @@ def test_assertions(self): class TestRandomIoUCrop: @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) - def test__get_params(self, device, options, mocker): - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + def test__get_params(self, device, options): + orig_h, orig_w = size = (24, 32) + image = make_image(size) bboxes = datapoints.BoundingBox( torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), format="XYXY", - spatial_size=image.spatial_size, + canvas_size=size, device=device, ) sample = [image, bboxes] @@ -1088,8 +1072,6 @@ def test__get_params(self, device, options, mocker): assert len(params["is_within_crop_area"]) > 0 assert params["is_within_crop_area"].dtype == torch.bool - orig_h = image.spatial_size[0] - orig_w = image.spatial_size[1] assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) @@ -1104,7 +1086,7 @@ def test__get_params(self, device, options, mocker): def test__transform_empty_params(self, mocker): transform = transforms.RandomIoUCrop(sampler_options=[2.0]) image = datapoints.Image(torch.rand(1, 3, 4, 4)) - bboxes = datapoints.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4)) + bboxes = datapoints.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", canvas_size=(4, 4)) label = torch.tensor([1]) sample = [image, bboxes, label] # Let's mock transform._get_params to control the output: @@ -1123,9 +1105,10 @@ def test_forward_assertion(self): def test__transform(self, mocker): transform = transforms.RandomIoUCrop() - image = datapoints.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), batch_dims=(6,)) - masks = make_detection_mask((32, 24), num_objects=6) + size = (32, 24) + image = make_image(size) + bboxes = make_bounding_box(format="XYXY", canvas_size=size, batch_dims=(6,)) + masks = make_detection_mask(size, num_objects=6) sample = [image, bboxes, masks] @@ -1156,13 +1139,14 @@ def test__transform(self, mocker): class TestScaleJitter: - def test__get_params(self, mocker): - spatial_size = (24, 32) + def test__get_params(self): + canvas_size = (24, 32) target_size = (16, 12) scale_range = (0.5, 1.5) transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size) + + sample = make_image(canvas_size) n_samples = 5 for _ in range(n_samples): @@ -1175,11 +1159,11 @@ def test__get_params(self, mocker): assert isinstance(size, tuple) and len(size) == 2 height, width = size - r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0] - r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1] + r_min = min(target_size[1] / canvas_size[0], target_size[0] / canvas_size[1]) * scale_range[0] + r_max = min(target_size[1] / canvas_size[0], target_size[0] / 
canvas_size[1]) * scale_range[1] - assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max) - assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max) + assert int(canvas_size[0] * r_min) <= height <= int(canvas_size[0] * r_max) + assert int(canvas_size[1] * r_min) <= width <= int(canvas_size[1] * r_max) def test__transform(self, mocker): interpolation_sentinel = mocker.MagicMock(spec=InterpolationMode) @@ -1207,12 +1191,12 @@ def test__transform(self, mocker): class TestRandomShortestSize: @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) - def test__get_params(self, min_size, max_size, mocker): - spatial_size = (3, 10) + def test__get_params(self, min_size, max_size): + canvas_size = (3, 10) transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True) - sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size) + sample = make_image(canvas_size) params = transform._get_params([sample]) assert "size" in params @@ -1579,7 +1563,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4)) boxes[:, 2:] += boxes[:, :2] boxes = boxes.clamp(min=0, max=min(H, W)) - boxes = datapoints.BoundingBox(boxes, format="XYXY", spatial_size=(H, W)) + boxes = datapoints.BoundingBox(boxes, format="XYXY", canvas_size=(H, W)) masks = datapoints.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) @@ -1655,7 +1639,7 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): boxes = datapoints.BoundingBox( boxes, format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(H, W), + canvas_size=(H, W), ) masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) @@ -1725,7 +1709,7 @@ def test_sanitize_bounding_boxes_errors(): good_bbox = datapoints.BoundingBox( [[0, 0, 10, 10]], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(20, 20), + canvas_size=(20, 20), ) with pytest.raises(ValueError, match="min_size must be >= 1"): @@ -1756,7 +1740,7 @@ def test_sanitize_bounding_boxes_errors(): [[0, 0, 10, 10]], ], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(20, 20), + canvas_size=(20, 20), ) different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])} transforms.SanitizeBoundingBox()(different_sizes) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index bf297473bc2..49d90a6306c 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -31,7 +31,7 @@ from torchvision.transforms import functional as legacy_F from torchvision.transforms.v2 import functional as prototype_F from torchvision.transforms.v2.functional import to_image_pil -from torchvision.transforms.v2.utils import query_spatial_size +from torchvision.transforms.v2.utils import query_size DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=["RGB"], extra_dims=[(4,)]) @@ -1090,7 +1090,7 @@ def make_label(extra_dims, categories): pil_image = to_image_pil(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1100,7 +1100,7 @@ def make_label(extra_dims, categories): tensor_image = 
torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1110,7 +1110,7 @@ def make_label(extra_dims, categories): datapoint_image = make_image(size=size, color_space="RGB", dtype=torch.float32) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1172,7 +1172,7 @@ def __init__(self, size, fill=0): self.fill = v2_transforms._geometry._setup_fill_arg(fill) def _get_params(self, sample): - height, width = query_spatial_size(sample) + height, width = query_size(sample) padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] needs_padding = any(padding) return dict(padding=padding, needs_padding=needs_padding) diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py index 465cc227107..0a0f9f48ab1 100644 --- a/test/test_transforms_v2_functional.py +++ b/test/test_transforms_v2_functional.py @@ -351,7 +351,7 @@ def test_scripted_smoke(self, info, args_kwargs, device): F.get_image_size, F.get_num_channels, F.get_num_frames, - F.get_spatial_size, + F.get_size, F.rgb_to_grayscale, F.uniform_temporal_subsample, ], @@ -587,27 +587,27 @@ class TestClampBoundingBox: [ dict(), dict(format=datapoints.BoundingBoxFormat.XYXY), - dict(spatial_size=(1, 1)), + dict(canvas_size=(1, 1)), ], ) def test_simple_tensor_insufficient_metadata(self, metadata): simple_tensor = next(make_bounding_boxes()).as_subclass(torch.Tensor) - with pytest.raises(ValueError, match=re.escape("`format` and `spatial_size` has to be passed")): + with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` has to be passed")): F.clamp_bounding_box(simple_tensor, **metadata) @pytest.mark.parametrize( "metadata", [ dict(format=datapoints.BoundingBoxFormat.XYXY), - dict(spatial_size=(1, 1)), - dict(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(1, 1)), + dict(canvas_size=(1, 1)), + dict(format=datapoints.BoundingBoxFormat.XYXY, canvas_size=(1, 1)), ], ) def test_datapoint_explicit_metadata(self, metadata): datapoint = next(make_bounding_boxes()) - with pytest.raises(ValueError, match=re.escape("`format` and `spatial_size` must not be passed")): + with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` must not be passed")): F.clamp_bounding_box(datapoint, **metadata) @@ -692,7 +692,7 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, # expected_bboxes.append(out_box) format = datapoints.BoundingBoxFormat.XYXY - spatial_size = (64, 76) + canvas_size = (64, 76) in_boxes = [ [10.0, 15.0, 25.0, 35.0], [50.0, 5.0, 70.0, 22.0], @@ -703,23 +703,23 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, in_boxes = convert_format_bounding_box(in_boxes, datapoints.BoundingBoxFormat.XYXY, format) expected_bboxes = clamp_bounding_box( - datapoints.BoundingBox(expected_bboxes, format="XYXY", spatial_size=spatial_size) + datapoints.BoundingBox(expected_bboxes, format="XYXY", canvas_size=canvas_size) ).tolist() - 
output_boxes, output_spatial_size = F.crop_bounding_box( + output_boxes, output_canvas_size = F.crop_bounding_box( in_boxes, format, top, left, - spatial_size[0], - spatial_size[1], + canvas_size[0], + canvas_size[1], ) if format != datapoints.BoundingBoxFormat.XYXY: output_boxes = convert_format_bounding_box(output_boxes, format, datapoints.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - torch.testing.assert_close(output_spatial_size, spatial_size) + torch.testing.assert_close(output_canvas_size, canvas_size) @pytest.mark.parametrize("device", cpu_and_cuda()) @@ -756,7 +756,7 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): return bbox format = datapoints.BoundingBoxFormat.XYXY - spatial_size = (100, 100) + canvas_size = (100, 100) in_boxes = [ [10.0, 10.0, 20.0, 20.0], [5.0, 10.0, 15.0, 20.0], @@ -767,18 +767,18 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): expected_bboxes = torch.tensor(expected_bboxes, device=device) in_boxes = datapoints.BoundingBox( - in_boxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, device=device + in_boxes, format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size, device=device ) if format != datapoints.BoundingBoxFormat.XYXY: in_boxes = convert_format_bounding_box(in_boxes, datapoints.BoundingBoxFormat.XYXY, format) - output_boxes, output_spatial_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) + output_boxes, output_canvas_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) if format != datapoints.BoundingBoxFormat.XYXY: output_boxes = convert_format_bounding_box(output_boxes, format, datapoints.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_spatial_size, size) + torch.testing.assert_close(output_canvas_size, size) def _parse_padding(padding): @@ -817,28 +817,28 @@ def _compute_expected_bbox(bbox, padding_): bbox = bbox.to(dtype) return bbox - def _compute_expected_spatial_size(bbox, padding_): + def _compute_expected_canvas_size(bbox, padding_): pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) - height, width = bbox.spatial_size + height, width = bbox.canvas_size return height + pad_up + pad_down, width + pad_left + pad_right for bboxes in make_bounding_boxes(): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_spatial_size = bboxes.spatial_size + bboxes_canvas_size = bboxes.canvas_size - output_boxes, output_spatial_size = F.pad_bounding_box( - bboxes, format=bboxes_format, spatial_size=bboxes_spatial_size, padding=padding + output_boxes, output_canvas_size = F.pad_bounding_box( + bboxes, format=bboxes_format, canvas_size=bboxes_canvas_size, padding=padding ) - torch.testing.assert_close(output_spatial_size, _compute_expected_spatial_size(bboxes, padding)) + torch.testing.assert_close(output_canvas_size, _compute_expected_canvas_size(bboxes, padding)) if bboxes.ndim < 2 or bboxes.shape[0] == 0: bboxes = [bboxes] expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) + bbox = datapoints.BoundingBox(bbox, format=bboxes_format, canvas_size=bboxes_canvas_size) expected_bboxes.append(_compute_expected_bbox(bbox, padding)) if len(expected_bboxes) > 1: @@ -906,24 +906,24 @@ def _compute_expected_bbox(bbox, pcoeffs_): out_bbox = datapoints.BoundingBox( out_bbox, 
format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=bbox.spatial_size, + canvas_size=bbox.canvas_size, dtype=bbox.dtype, device=bbox.device, ) return clamp_bounding_box(convert_format_bounding_box(out_bbox, new_format=bbox.format)) - spatial_size = (32, 38) + canvas_size = (32, 38) pcoeffs = _get_perspective_coeffs(startpoints, endpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): + for bboxes in make_bounding_boxes(spatial_size=canvas_size, extra_dims=((4,),)): bboxes = bboxes.to(device) output_bboxes = F.perspective_bounding_box( bboxes.as_subclass(torch.Tensor), format=bboxes.format, - spatial_size=bboxes.spatial_size, + canvas_size=bboxes.canvas_size, startpoints=None, endpoints=None, coefficients=pcoeffs, @@ -934,7 +934,7 @@ def _compute_expected_bbox(bbox, pcoeffs_): expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBox(bbox, format=bboxes.format, spatial_size=bboxes.spatial_size) + bbox = datapoints.BoundingBox(bbox, format=bboxes.format, canvas_size=bboxes.canvas_size) expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) @@ -951,15 +951,15 @@ def _compute_expected_bbox(bbox, pcoeffs_): def test_correctness_center_crop_bounding_box(device, output_size): def _compute_expected_bbox(bbox, output_size_): format_ = bbox.format - spatial_size_ = bbox.spatial_size + canvas_size_ = bbox.canvas_size dtype = bbox.dtype bbox = convert_format_bounding_box(bbox.float(), format_, datapoints.BoundingBoxFormat.XYWH) if len(output_size_) == 1: output_size_.append(output_size_[-1]) - cy = int(round((spatial_size_[0] - output_size_[0]) * 0.5)) - cx = int(round((spatial_size_[1] - output_size_[1]) * 0.5)) + cy = int(round((canvas_size_[0] - output_size_[0]) * 0.5)) + cx = int(round((canvas_size_[1] - output_size_[1]) * 0.5)) out_bbox = [ bbox[0].item() - cx, bbox[1].item() - cy, @@ -968,16 +968,16 @@ def _compute_expected_bbox(bbox, output_size_): ] out_bbox = torch.tensor(out_bbox) out_bbox = convert_format_bounding_box(out_bbox, datapoints.BoundingBoxFormat.XYWH, format_) - out_bbox = clamp_bounding_box(out_bbox, format=format_, spatial_size=output_size) + out_bbox = clamp_bounding_box(out_bbox, format=format_, canvas_size=output_size) return out_bbox.to(dtype=dtype, device=bbox.device) for bboxes in make_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_spatial_size = bboxes.spatial_size + bboxes_canvas_size = bboxes.canvas_size - output_boxes, output_spatial_size = F.center_crop_bounding_box( - bboxes, bboxes_format, bboxes_spatial_size, output_size + output_boxes, output_canvas_size = F.center_crop_bounding_box( + bboxes, bboxes_format, bboxes_canvas_size, output_size ) if bboxes.ndim < 2: @@ -985,7 +985,7 @@ def _compute_expected_bbox(bbox, output_size_): expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) + bbox = datapoints.BoundingBox(bbox, format=bboxes_format, canvas_size=bboxes_canvas_size) expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) if len(expected_bboxes) > 1: @@ -994,7 +994,7 @@ def _compute_expected_bbox(bbox, output_size_): expected_bboxes = expected_bboxes[0] torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0) - torch.testing.assert_close(output_spatial_size, output_size) + 
torch.testing.assert_close(output_canvas_size, output_size) @pytest.mark.parametrize("device", cpu_and_cuda()) @@ -1022,11 +1022,11 @@ def _compute_expected_mask(mask, output_size): # Copied from test/test_functional_tensor.py @pytest.mark.parametrize("device", cpu_and_cuda()) -@pytest.mark.parametrize("spatial_size", ("small", "large")) +@pytest.mark.parametrize("canvas_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) -def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, sigma): +def test_correctness_gaussian_blur_image_tensor(device, canvas_size, dt, ksize, sigma): fn = F.gaussian_blur_image_tensor # true_cv2_results = { @@ -1046,7 +1046,7 @@ def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") true_cv2_results = torch.load(p) - if spatial_size == "small": + if canvas_size == "small": tensor = ( torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) ) diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index 69180b99dbc..a8dc079e602 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -380,7 +380,7 @@ def assert_warns_antialias_default_value(): yield -def reference_affine_bounding_box_helper(bounding_box, *, format, spatial_size, affine_matrix): +def reference_affine_bounding_box_helper(bounding_box, *, format, canvas_size, affine_matrix): def transform(bbox): # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 in_dtype = bbox.dtype @@ -414,7 +414,7 @@ def transform(bbox): out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True ) # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 - out_bbox = F.clamp_bounding_box(out_bbox, format=format, spatial_size=spatial_size) + out_bbox = F.clamp_bounding_box(out_bbox, format=format, canvas_size=canvas_size) out_bbox = out_bbox.to(dtype=in_dtype) return out_bbox @@ -501,15 +501,15 @@ def test_kernel_bounding_box(self, format, size, use_max_size, dtype, device): return bounding_box = make_bounding_box( + self.INPUT_SIZE, format=format, - spatial_size=self.INPUT_SIZE, dtype=dtype, device=device, ) check_kernel( F.resize_bounding_box, bounding_box, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, size=size, **max_size_kwarg, check_scripted_vs_eager=not isinstance(size, int), @@ -576,8 +576,8 @@ def test_transform(self, size, device, make_input): check_transform(transforms.Resize, make_input(self.INPUT_SIZE, device=device), size=size, antialias=True) def _check_output_size(self, input, output, *, size, max_size): - assert tuple(F.get_spatial_size(output)) == self._compute_output_size( - input_size=F.get_spatial_size(input), size=size, max_size=max_size + assert tuple(F.get_size(output)) == self._compute_output_size( + input_size=F.get_size(input), size=size, max_size=max_size ) @pytest.mark.parametrize("size", OUTPUT_SIZES) @@ -601,9 +601,9 @@ def test_image_correctness(self, size, interpolation, use_max_size, fn): torch.testing.assert_close(actual, expected, atol=1, rtol=0) def _reference_resize_bounding_box(self, 
bounding_box, *, size, max_size=None): - old_height, old_width = bounding_box.spatial_size + old_height, old_width = bounding_box.canvas_size new_height, new_width = self._compute_output_size( - input_size=bounding_box.spatial_size, size=size, max_size=max_size + input_size=bounding_box.canvas_size, size=size, max_size=max_size ) if (old_height, old_width) == (new_height, new_width): @@ -620,10 +620,10 @@ def _reference_resize_bounding_box(self, bounding_box, *, size, max_size=None): expected_bboxes = reference_affine_bounding_box_helper( bounding_box, format=bounding_box.format, - spatial_size=(new_height, new_width), + canvas_size=(new_height, new_width), affine_matrix=affine_matrix, ) - return datapoints.BoundingBox.wrap_like(bounding_box, expected_bboxes, spatial_size=(new_height, new_width)) + return datapoints.BoundingBox.wrap_like(bounding_box, expected_bboxes, canvas_size=(new_height, new_width)) @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) @pytest.mark.parametrize("size", OUTPUT_SIZES) @@ -633,7 +633,7 @@ def test_bounding_box_correctness(self, format, size, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_box = make_bounding_box(format=format, spatial_size=self.INPUT_SIZE) + bounding_box = make_bounding_box(self.INPUT_SIZE, format=format) actual = fn(bounding_box, size=size, **max_size_kwarg) expected = self._reference_resize_bounding_box(bounding_box, size=size, **max_size_kwarg) @@ -750,7 +750,7 @@ def test_transform_unknown_size_error(self): def test_noop(self, size, make_input): input = make_input(self.INPUT_SIZE) - output = F.resize(input, size=F.get_spatial_size(input), antialias=True) + output = F.resize(input, size=F.get_size(input), antialias=True) # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there # is a good reason to break this, feel free to downgrade to an equality check. 
@@ -780,11 +780,11 @@ def test_no_regression_5405(self, make_input): input = make_input(self.INPUT_SIZE) - size = min(F.get_spatial_size(input)) + size = min(F.get_size(input)) max_size = size + 1 output = F.resize(input, size=size, max_size=max_size, antialias=True) - assert max(F.get_spatial_size(output)) == max_size + assert max(F.get_size(output)) == max_size class TestHorizontalFlip: @@ -802,7 +802,7 @@ def test_kernel_bounding_box(self, format, dtype, device): F.horizontal_flip_bounding_box, bounding_box, format=format, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, ) @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) @@ -862,7 +862,7 @@ def test_image_correctness(self, fn): def _reference_horizontal_flip_bounding_box(self, bounding_box): affine_matrix = np.array( [ - [-1, 0, bounding_box.spatial_size[1]], + [-1, 0, bounding_box.canvas_size[1]], [0, 1, 0], ], dtype="float64" if bounding_box.dtype == torch.float64 else "float32", @@ -871,7 +871,7 @@ def _reference_horizontal_flip_bounding_box(self, bounding_box): expected_bboxes = reference_affine_bounding_box_helper( bounding_box, format=bounding_box.format, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, affine_matrix=affine_matrix, ) @@ -983,7 +983,7 @@ def test_kernel_bounding_box(self, param, value, format, dtype, device): F.affine_bounding_box, bounding_box, format=format, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, **{param: value}, check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), ) @@ -1121,7 +1121,7 @@ def _compute_affine_matrix(self, *, angle, translate, scale, shear, center): def _reference_affine_bounding_box(self, bounding_box, *, angle, translate, scale, shear, center): if center is None: - center = [s * 0.5 for s in bounding_box.spatial_size[::-1]] + center = [s * 0.5 for s in bounding_box.canvas_size[::-1]] affine_matrix = self._compute_affine_matrix( angle=angle, translate=translate, scale=scale, shear=shear, center=center @@ -1131,7 +1131,7 @@ def _reference_affine_bounding_box(self, bounding_box, *, angle, translate, scal expected_bboxes = reference_affine_bounding_box_helper( bounding_box, format=bounding_box.format, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, affine_matrix=affine_matrix, ) @@ -1190,7 +1190,7 @@ def test_transform_bounding_box_correctness(self, format, center, seed): @pytest.mark.parametrize("seed", list(range(10))) def test_transform_get_params_bounds(self, degrees, translate, scale, shear, seed): image = make_image() - height, width = F.get_spatial_size(image) + height, width = F.get_size(image) transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) @@ -1281,7 +1281,7 @@ def test_kernel_bounding_box(self, format, dtype, device): F.vertical_flip_bounding_box, bounding_box, format=format, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, ) @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) @@ -1340,7 +1340,7 @@ def _reference_vertical_flip_bounding_box(self, bounding_box): affine_matrix = np.array( [ [1, 0, 0], - [0, -1, bounding_box.spatial_size[0]], + [0, -1, bounding_box.canvas_size[0]], ], dtype="float64" if bounding_box.dtype == torch.float64 else "float32", ) @@ -1348,7 +1348,7 @@ def _reference_vertical_flip_bounding_box(self, bounding_box): expected_bboxes = 
reference_affine_bounding_box_helper( bounding_box, format=bounding_box.format, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, affine_matrix=affine_matrix, ) @@ -1437,7 +1437,7 @@ def test_kernel_bounding_box(self, param, value, format, dtype, device): F.rotate_bounding_box, bounding_box, format=format, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, **kwargs, ) @@ -1543,7 +1543,7 @@ def _reference_rotate_bounding_box(self, bounding_box, *, angle, expand, center) raise ValueError("This reference currently does not support expand=True") if center is None: - center = [s * 0.5 for s in bounding_box.spatial_size[::-1]] + center = [s * 0.5 for s in bounding_box.canvas_size[::-1]] a = np.cos(angle * np.pi / 180.0) b = np.sin(angle * np.pi / 180.0) @@ -1560,7 +1560,7 @@ def _reference_rotate_bounding_box(self, bounding_box, *, angle, expand, center) expected_bboxes = reference_affine_bounding_box_helper( bounding_box, format=bounding_box.format, - spatial_size=bounding_box.spatial_size, + canvas_size=bounding_box.canvas_size, affine_matrix=affine_matrix, ) diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 198ab39a475..7ed0cac9376 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -4,16 +4,16 @@ import torch import torchvision.transforms.v2.utils -from common_utils import make_bounding_box, make_detection_mask, make_image +from common_utils import DEFAULT_SIZE, make_bounding_box, make_detection_mask, make_image from torchvision import datapoints from torchvision.transforms.v2.functional import to_image_pil from torchvision.transforms.v2.utils import has_all, has_any -IMAGE = make_image(color_space="RGB") -BOUNDING_BOX = make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) -MASK = make_detection_mask(size=IMAGE.spatial_size) +IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") +BOUNDING_BOX = make_bounding_box(DEFAULT_SIZE, format=datapoints.BoundingBoxFormat.XYXY) +MASK = make_detection_mask(DEFAULT_SIZE) @pytest.mark.parametrize( diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py index dc04fbfc7a9..dd80ca9e5ac 100644 --- a/test/transforms_v2_kernel_infos.py +++ b/test/transforms_v2_kernel_infos.py @@ -186,8 +186,8 @@ def float32_vs_uint8_fill_adapter(other_args, kwargs): return other_args, dict(kwargs, fill=fill) -def reference_affine_bounding_box_helper(bounding_box, *, format, spatial_size, affine_matrix): - def transform(bbox, affine_matrix_, format_, spatial_size_): +def reference_affine_bounding_box_helper(bounding_box, *, format, canvas_size, affine_matrix): + def transform(bbox, affine_matrix_, format_, canvas_size_): # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 in_dtype = bbox.dtype if not torch.is_floating_point(bbox): @@ -220,14 +220,14 @@ def transform(bbox, affine_matrix_, format_, spatial_size_): out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_, inplace=True ) # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 - out_bbox = F.clamp_bounding_box(out_bbox, format=format_, spatial_size=spatial_size_) + out_bbox = F.clamp_bounding_box(out_bbox, format=format_, canvas_size=canvas_size_) out_bbox = out_bbox.to(dtype=in_dtype) return out_bbox if bounding_box.ndim < 2: bounding_box = [bounding_box] - expected_bboxes = [transform(bbox, affine_matrix, format, 
spatial_size) for bbox in bounding_box] + expected_bboxes = [transform(bbox, affine_matrix, format, canvas_size) for bbox in bounding_box] if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) else: @@ -323,11 +323,11 @@ def reference_crop_bounding_box(bounding_box, *, format, top, left, height, widt dtype="float64" if bounding_box.dtype == torch.float64 else "float32", ) - spatial_size = (height, width) + canvas_size = (height, width) expected_bboxes = reference_affine_bounding_box_helper( - bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix + bounding_box, format=format, canvas_size=canvas_size, affine_matrix=affine_matrix ) - return expected_bboxes, spatial_size + return expected_bboxes, canvas_size def reference_inputs_crop_bounding_box(): @@ -509,7 +509,7 @@ def sample_inputs_pad_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - spatial_size=bounding_box_loader.spatial_size, + canvas_size=bounding_box_loader.canvas_size, padding=padding, padding_mode="constant", ) @@ -532,7 +532,7 @@ def sample_inputs_pad_video(): yield ArgsKwargs(video_loader, padding=[1]) -def reference_pad_bounding_box(bounding_box, *, format, spatial_size, padding, padding_mode): +def reference_pad_bounding_box(bounding_box, *, format, canvas_size, padding, padding_mode): left, right, top, bottom = _parse_pad_padding(padding) @@ -544,11 +544,11 @@ def reference_pad_bounding_box(bounding_box, *, format, spatial_size, padding, p dtype="float64" if bounding_box.dtype == torch.float64 else "float32", ) - height = spatial_size[0] + top + bottom - width = spatial_size[1] + left + right + height = canvas_size[0] + top + bottom + width = canvas_size[1] + left + right expected_bboxes = reference_affine_bounding_box_helper( - bounding_box, format=format, spatial_size=(height, width), affine_matrix=affine_matrix + bounding_box, format=format, canvas_size=(height, width), affine_matrix=affine_matrix ) return expected_bboxes, (height, width) @@ -560,7 +560,7 @@ def reference_inputs_pad_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - spatial_size=bounding_box_loader.spatial_size, + canvas_size=bounding_box_loader.canvas_size, padding=padding, padding_mode="constant", ) @@ -662,7 +662,7 @@ def sample_inputs_perspective_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - spatial_size=bounding_box_loader.spatial_size, + canvas_size=bounding_box_loader.canvas_size, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0], @@ -671,7 +671,7 @@ def sample_inputs_perspective_bounding_box(): format = datapoints.BoundingBoxFormat.XYXY loader = make_bounding_box_loader(format=format) yield ArgsKwargs( - loader, format=format, spatial_size=loader.spatial_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS + loader, format=format, canvas_size=loader.canvas_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS ) @@ -744,13 +744,13 @@ def sample_inputs_perspective_video(): ) -def _get_elastic_displacement(spatial_size): - return torch.rand(1, *spatial_size, 2) +def _get_elastic_displacement(canvas_size): + return torch.rand(1, *canvas_size, 2) def sample_inputs_elastic_image_tensor(): for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): - displacement = _get_elastic_displacement(image_loader.spatial_size) + displacement = _get_elastic_displacement(image_loader.canvas_size) for fill in 
get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) @@ -764,18 +764,18 @@ def reference_inputs_elastic_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - displacement = _get_elastic_displacement(image_loader.spatial_size) + displacement = _get_elastic_displacement(image_loader.canvas_size) for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) def sample_inputs_elastic_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): - displacement = _get_elastic_displacement(bounding_box_loader.spatial_size) + displacement = _get_elastic_displacement(bounding_box_loader.canvas_size) yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - spatial_size=bounding_box_loader.spatial_size, + canvas_size=bounding_box_loader.canvas_size, displacement=displacement, ) @@ -852,7 +852,7 @@ def sample_inputs_center_crop_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - spatial_size=bounding_box_loader.spatial_size, + canvas_size=bounding_box_loader.canvas_size, output_size=output_size, ) @@ -977,7 +977,7 @@ def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_for image.mul_(torch.iinfo(dtype).max).round_() return image.to(dtype=dtype, device=device, memory_format=memory_format, copy=True) - spatial_size = (256, 256) + canvas_size = (256, 256) for dtype, color_space, fn in itertools.product( [torch.uint8], ["GRAY", "RGB"], @@ -1007,7 +1007,7 @@ def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_for ], ], ): - image_loader = ImageLoader(fn, shape=(get_num_channels(color_space), *spatial_size), dtype=dtype) + image_loader = ImageLoader(fn, shape=(get_num_channels(color_space), *canvas_size), dtype=dtype) yield ArgsKwargs(image_loader) @@ -1489,7 +1489,7 @@ def sample_inputs_clamp_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - spatial_size=bounding_box_loader.spatial_size, + canvas_size=bounding_box_loader.canvas_size, ) @@ -1504,7 +1504,7 @@ def sample_inputs_clamp_bounding_box(): _FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] -def _get_five_ten_crop_spatial_size(size): +def _get_five_ten_crop_canvas_size(size): if isinstance(size, int): crop_height = crop_width = size elif len(size) == 1: @@ -1517,7 +1517,7 @@ def _get_five_ten_crop_spatial_size(size): def sample_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_spatial_size(size)], + sizes=[_get_five_ten_crop_canvas_size(size)], color_spaces=["RGB"], dtypes=[torch.float32], ): @@ -1527,21 +1527,21 @@ def sample_inputs_five_crop_image_tensor(): def reference_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()], dtypes=[torch.uint8] + sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] ): yield ArgsKwargs(image_loader, size=size) def sample_inputs_five_crop_video(): size = _FIVE_TEN_CROP_SIZES[0] - for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_spatial_size(size)]): + for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): yield ArgsKwargs(video_loader, size=size) def 
sample_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_spatial_size(size)], + sizes=[_get_five_ten_crop_canvas_size(size)], color_spaces=["RGB"], dtypes=[torch.float32], ): @@ -1551,14 +1551,14 @@ def sample_inputs_ten_crop_image_tensor(): def reference_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()], dtypes=[torch.uint8] + sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] ): yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) def sample_inputs_ten_crop_video(): size = _FIVE_TEN_CROP_SIZES[0] - for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_spatial_size(size)]): + for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): yield ArgsKwargs(video_loader, size=size) diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index 11d42f171e4..3ab579713d1 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -30,7 +30,7 @@ class BoundingBox(Datapoint): Args: data: Any data that can be turned into a tensor with :func:`torch.as_tensor`. format (BoundingBoxFormat, str): Format of the bounding box. - spatial_size (two-tuple of ints): Height and width of the corresponding image or video. + canvas_size (two-tuple of ints): Height and width of the corresponding image or video. dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from ``data``. device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a @@ -40,13 +40,13 @@ class BoundingBox(Datapoint): """ format: BoundingBoxFormat - spatial_size: Tuple[int, int] + canvas_size: Tuple[int, int] @classmethod - def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, spatial_size: Tuple[int, int]) -> BoundingBox: + def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, canvas_size: Tuple[int, int]) -> BoundingBox: bounding_box = tensor.as_subclass(cls) bounding_box.format = format - bounding_box.spatial_size = spatial_size + bounding_box.canvas_size = canvas_size return bounding_box def __new__( @@ -54,7 +54,7 @@ def __new__( data: Any, *, format: Union[BoundingBoxFormat, str], - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, requires_grad: Optional[bool] = None, @@ -64,7 +64,7 @@ def __new__( if isinstance(format, str): format = BoundingBoxFormat[format.upper()] - return cls._wrap(tensor, format=format, spatial_size=spatial_size) + return cls._wrap(tensor, format=format, canvas_size=canvas_size) @classmethod def wrap_like( @@ -73,7 +73,7 @@ def wrap_like( tensor: torch.Tensor, *, format: Optional[BoundingBoxFormat] = None, - spatial_size: Optional[Tuple[int, int]] = None, + canvas_size: Optional[Tuple[int, int]] = None, ) -> BoundingBox: """Wrap a :class:`torch.Tensor` as :class:`BoundingBox` from a reference. @@ -82,7 +82,7 @@ def wrap_like( tensor (Tensor): Tensor to be wrapped as :class:`BoundingBox` format (BoundingBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the reference. 
- spatial_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If + canvas_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If omitted, it is taken from the reference. """ @@ -92,21 +92,21 @@ def wrap_like( return cls._wrap( tensor, format=format if format is not None else other.format, - spatial_size=spatial_size if spatial_size is not None else other.spatial_size, + canvas_size=canvas_size if canvas_size is not None else other.canvas_size, ) def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(format=self.format, spatial_size=self.spatial_size) + return self._make_repr(format=self.format, canvas_size=self.canvas_size) def horizontal_flip(self) -> BoundingBox: output = self._F.horizontal_flip_bounding_box( - self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size + self.as_subclass(torch.Tensor), format=self.format, canvas_size=self.canvas_size ) return BoundingBox.wrap_like(self, output) def vertical_flip(self) -> BoundingBox: output = self._F.vertical_flip_bounding_box( - self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size + self.as_subclass(torch.Tensor), format=self.format, canvas_size=self.canvas_size ) return BoundingBox.wrap_like(self, output) @@ -117,25 +117,25 @@ def resize( # type: ignore[override] max_size: Optional[int] = None, antialias: Optional[Union[str, bool]] = "warn", ) -> BoundingBox: - output, spatial_size = self._F.resize_bounding_box( + output, canvas_size = self._F.resize_bounding_box( self.as_subclass(torch.Tensor), - spatial_size=self.spatial_size, + canvas_size=self.canvas_size, size=size, max_size=max_size, ) - return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBox.wrap_like(self, output, canvas_size=canvas_size) def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox: - output, spatial_size = self._F.crop_bounding_box( + output, canvas_size = self._F.crop_bounding_box( self.as_subclass(torch.Tensor), self.format, top=top, left=left, height=height, width=width ) - return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBox.wrap_like(self, output, canvas_size=canvas_size) def center_crop(self, output_size: List[int]) -> BoundingBox: - output, spatial_size = self._F.center_crop_bounding_box( - self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size, output_size=output_size + output, canvas_size = self._F.center_crop_bounding_box( + self.as_subclass(torch.Tensor), format=self.format, canvas_size=self.canvas_size, output_size=output_size ) - return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBox.wrap_like(self, output, canvas_size=canvas_size) def resized_crop( self, @@ -147,10 +147,10 @@ def resized_crop( interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, antialias: Optional[Union[str, bool]] = "warn", ) -> BoundingBox: - output, spatial_size = self._F.resized_crop_bounding_box( + output, canvas_size = self._F.resized_crop_bounding_box( self.as_subclass(torch.Tensor), self.format, top, left, height, width, size=size ) - return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBox.wrap_like(self, output, canvas_size=canvas_size) def pad( self, @@ -158,14 +158,14 @@ def pad( fill: Optional[Union[int, float, List[float]]] = None, padding_mode: str = "constant", ) -> BoundingBox: 
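        # (Illustrative note, not from the original patch) pad here and rotate below, like resize and crop
        # above, follow one pattern after this rename: the kernel returns ``(output, canvas_size)`` and the
        # result is re-wrapped via ``wrap_like(self, output, canvas_size=canvas_size)``; ops whose canvas
        # stays fixed (affine, perspective, elastic) re-wrap without passing a new canvas_size.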
- output, spatial_size = self._F.pad_bounding_box( + output, canvas_size = self._F.pad_bounding_box( self.as_subclass(torch.Tensor), format=self.format, - spatial_size=self.spatial_size, + canvas_size=self.canvas_size, padding=padding, padding_mode=padding_mode, ) - return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBox.wrap_like(self, output, canvas_size=canvas_size) def rotate( self, @@ -175,15 +175,15 @@ def rotate( center: Optional[List[float]] = None, fill: _FillTypeJIT = None, ) -> BoundingBox: - output, spatial_size = self._F.rotate_bounding_box( + output, canvas_size = self._F.rotate_bounding_box( self.as_subclass(torch.Tensor), format=self.format, - spatial_size=self.spatial_size, + canvas_size=self.canvas_size, angle=angle, expand=expand, center=center, ) - return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBox.wrap_like(self, output, canvas_size=canvas_size) def affine( self, @@ -198,7 +198,7 @@ def affine( output = self._F.affine_bounding_box( self.as_subclass(torch.Tensor), self.format, - self.spatial_size, + self.canvas_size, angle, translate=translate, scale=scale, @@ -218,7 +218,7 @@ def perspective( output = self._F.perspective_bounding_box( self.as_subclass(torch.Tensor), format=self.format, - spatial_size=self.spatial_size, + canvas_size=self.canvas_size, startpoints=startpoints, endpoints=endpoints, coefficients=coefficients, @@ -232,6 +232,6 @@ def elastic( fill: _FillTypeJIT = None, ) -> BoundingBox: output = self._F.elastic_bounding_box( - self.as_subclass(torch.Tensor), self.format, self.spatial_size, displacement=displacement + self.as_subclass(torch.Tensor), self.format, self.canvas_size, displacement=displacement ) return BoundingBox.wrap_like(self, output) diff --git a/torchvision/datapoints/_datapoint.py b/torchvision/datapoints/_datapoint.py index 0dabec58f25..a1b3226b38d 100644 --- a/torchvision/datapoints/_datapoint.py +++ b/torchvision/datapoints/_datapoint.py @@ -138,7 +138,7 @@ def __deepcopy__(self: D, memo: Dict[int, Any]) -> D: # *not* happen for `deepcopy(Tensor)`. A side-effect from detaching is that the `Tensor.requires_grad` # attribute is cleared, so we need to refill it before we return. # Note: We don't explicitly handle deep-copying of the metadata here. The only metadata we currently have is - # `BoundingBox.format` and `BoundingBox.spatial_size`, which are immutable and thus implicitly deep-copied by + # `BoundingBox.format` and `BoundingBox.canvas_size`, which are immutable and thus implicitly deep-copied by # `BoundingBox.clone()`. 
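        # Illustration only (assumed example, not part of this patch; requires `import copy`):
        #   box = datapoints.BoundingBox([0, 0, 2, 2], format="XYXY", canvas_size=(4, 4))
        #   assert copy.deepcopy(box).canvas_size == (4, 4)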
return self.detach().clone().requires_grad_(self.requires_grad) # type: ignore[return-value] diff --git a/torchvision/datapoints/_dataset_wrapper.py b/torchvision/datapoints/_dataset_wrapper.py index d88bc81e62b..c6dade480c6 100644 --- a/torchvision/datapoints/_dataset_wrapper.py +++ b/torchvision/datapoints/_dataset_wrapper.py @@ -333,13 +333,13 @@ def coco_dectection_wrapper_factory(dataset, target_keys): default={"image_id", "boxes", "labels"}, ) - def segmentation_to_mask(segmentation, *, spatial_size): + def segmentation_to_mask(segmentation, *, canvas_size): from pycocotools import mask segmentation = ( - mask.frPyObjects(segmentation, *spatial_size) + mask.frPyObjects(segmentation, *canvas_size) if isinstance(segmentation, dict) - else mask.merge(mask.frPyObjects(segmentation, *spatial_size)) + else mask.merge(mask.frPyObjects(segmentation, *canvas_size)) ) return torch.from_numpy(mask.decode(segmentation)) @@ -351,7 +351,7 @@ def wrapper(idx, sample): if not target: return image, dict(image_id=image_id) - spatial_size = tuple(F.get_spatial_size(image)) + canvas_size = tuple(F.get_size(image)) batched_target = list_of_dicts_to_dict_of_lists(target) target = {} @@ -364,7 +364,7 @@ def wrapper(idx, sample): datapoints.BoundingBox( batched_target["bbox"], format=datapoints.BoundingBoxFormat.XYWH, - spatial_size=spatial_size, + canvas_size=canvas_size, ), new_format=datapoints.BoundingBoxFormat.XYXY, ) @@ -373,7 +373,7 @@ def wrapper(idx, sample): target["masks"] = datapoints.Mask( torch.stack( [ - segmentation_to_mask(segmentation, spatial_size=spatial_size) + segmentation_to_mask(segmentation, canvas_size=canvas_size) for segmentation in batched_target["segmentation"] ] ), @@ -448,7 +448,7 @@ def wrapper(idx, sample): for bndbox in batched_instances["bndbox"] ], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(image.height, image.width), + canvas_size=(image.height, image.width), ) if "labels" in target_keys: @@ -485,7 +485,7 @@ def wrapper(idx, sample): datapoints.BoundingBox( item, format=datapoints.BoundingBoxFormat.XYWH, - spatial_size=(image.height, image.width), + canvas_size=(image.height, image.width), ), new_format=datapoints.BoundingBoxFormat.XYXY, ), @@ -535,7 +535,7 @@ def wrapper(idx, sample): target["boxes"] = datapoints.BoundingBox( batched_target["bbox"], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(image.height, image.width), + canvas_size=(image.height, image.width), ) if "labels" in target_keys: @@ -630,7 +630,7 @@ def wrapper(idx, sample): if "bbox" in target_keys: target["bbox"] = F.convert_format_bounding_box( datapoints.BoundingBox( - target["bbox"], format=datapoints.BoundingBoxFormat.XYWH, spatial_size=(image.height, image.width) + target["bbox"], format=datapoints.BoundingBoxFormat.XYWH, canvas_size=(image.height, image.width) ), new_format=datapoints.BoundingBoxFormat.XYXY, ) diff --git a/torchvision/datapoints/_image.py b/torchvision/datapoints/_image.py index e47a6c10fc3..2ebf4954d02 100644 --- a/torchvision/datapoints/_image.py +++ b/torchvision/datapoints/_image.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Union import PIL.Image import torch @@ -56,14 +56,6 @@ def wrap_like(cls, other: Image, tensor: torch.Tensor) -> Image: def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] return self._make_repr() - @property - def spatial_size(self) -> Tuple[int, int]: - return tuple(self.shape[-2:]) # type: 
ignore[return-value] - - @property - def num_channels(self) -> int: - return self.shape[-3] - def horizontal_flip(self) -> Image: output = self._F.horizontal_flip_image_tensor(self.as_subclass(torch.Tensor)) return Image.wrap_like(self, output) diff --git a/torchvision/datapoints/_mask.py b/torchvision/datapoints/_mask.py index 0135d793d32..bc50b30583c 100644 --- a/torchvision/datapoints/_mask.py +++ b/torchvision/datapoints/_mask.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Union import PIL.Image import torch @@ -51,10 +51,6 @@ def wrap_like( ) -> Mask: return cls._wrap(tensor) - @property - def spatial_size(self) -> Tuple[int, int]: - return tuple(self.shape[-2:]) # type: ignore[return-value] - def horizontal_flip(self) -> Mask: output = self._F.horizontal_flip_mask(self.as_subclass(torch.Tensor)) return Mask.wrap_like(self, output) diff --git a/torchvision/datapoints/_video.py b/torchvision/datapoints/_video.py index a6fbe2bd473..d527a68a4d1 100644 --- a/torchvision/datapoints/_video.py +++ b/torchvision/datapoints/_video.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Union import torch from torchvision.transforms.functional import InterpolationMode @@ -46,18 +46,6 @@ def wrap_like(cls, other: Video, tensor: torch.Tensor) -> Video: def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] return self._make_repr() - @property - def spatial_size(self) -> Tuple[int, int]: - return tuple(self.shape[-2:]) # type: ignore[return-value] - - @property - def num_channels(self) -> int: - return self.shape[-3] - - @property - def num_frames(self) -> int: - return self.shape[-4] - def horizontal_flip(self) -> Video: output = self._F.horizontal_flip_video(self.as_subclass(torch.Tensor)) return Video.wrap_like(self, output) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index d04baf739d1..20eeaaaf55b 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -11,7 +11,7 @@ from torchvision.transforms.v2._transform import _RandomApplyTransform from torchvision.transforms.v2.functional._geometry import _check_interpolation -from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_spatial_size +from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_size class _BaseMixupCutmix(_RandomApplyTransform): @@ -64,7 +64,7 @@ class RandomCutmix(_BaseMixupCutmix): def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: lam = float(self._dist.sample(())) # type: ignore[arg-type] - H, W = query_spatial_size(flat_inputs) + H, W = query_size(flat_inputs) r_x = torch.randint(W, ()) r_y = torch.randint(H, ()) diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 8d5cc24d25a..0b707c2ee0a 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -7,7 +7,7 @@ from torchvision.prototype.datapoints import Label, OneHotLabel from torchvision.transforms.v2 import functional as F, Transform from torchvision.transforms.v2._utils import _setup_fill_arg, _setup_size -from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_bounding_box, query_spatial_size +from torchvision.transforms.v2.utils import has_any, 
is_simple_tensor, query_bounding_box, query_size class FixedSizeCrop(Transform): @@ -46,7 +46,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: ) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - height, width = query_spatial_size(flat_inputs) + height, width = query_size(flat_inputs) new_height = min(height, self.crop_height) new_width = min(width, self.crop_width) @@ -67,7 +67,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: if needs_crop and bounding_boxes is not None: format = bounding_boxes.format - bounding_boxes, spatial_size = F.crop_bounding_box( + bounding_boxes, canvas_size = F.crop_bounding_box( bounding_boxes.as_subclass(torch.Tensor), format=format, top=top, @@ -75,7 +75,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: height=new_height, width=new_width, ) - bounding_boxes = F.clamp_bounding_box(bounding_boxes, format=format, spatial_size=spatial_size) + bounding_boxes = F.clamp_bounding_box(bounding_boxes, format=format, canvas_size=canvas_size) height_and_width = F.convert_format_bounding_box( bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYWH )[..., 2:] @@ -115,7 +115,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: elif isinstance(inpt, datapoints.BoundingBox): inpt = datapoints.BoundingBox.wrap_like( inpt, - F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size), + F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, canvas_size=inpt.canvas_size), ) if params["needs_pad"]: diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index 34c0ced43d2..f4b8e0e1e7e 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -9,7 +9,7 @@ from torchvision.transforms import _functional_tensor as _FT from torchvision.transforms.v2 import AutoAugmentPolicy, functional as F, InterpolationMode, Transform from torchvision.transforms.v2.functional._geometry import _check_interpolation -from torchvision.transforms.v2.functional._meta import get_spatial_size +from torchvision.transforms.v2.functional._meta import get_size from ._utils import _setup_fill_arg from .utils import check_type, is_simple_tensor @@ -312,7 +312,7 @@ def _get_policies( def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_spatial_size(image_or_video) + height, width = get_size(image_or_video) policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -403,7 +403,7 @@ def __init__( def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_spatial_size(image_or_video) + height, width = get_size(image_or_video) for _ in range(self.num_ops): transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -474,7 +474,7 @@ def __init__( def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_spatial_size(image_or_video) + height, width = get_size(image_or_video) transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -568,7 +568,7 @@ def _sample_dirichlet(self, params: torch.Tensor) -> torch.Tensor: def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, orig_image_or_video = 
self._flatten_and_extract_image_or_video(inputs) - height, width = get_spatial_size(orig_image_or_video) + height, width = get_size(orig_image_or_video) if isinstance(orig_image_or_video, torch.Tensor): image_or_video = orig_image_or_video diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index 731d768c2a6..0fcaebc6946 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -22,7 +22,7 @@ _setup_float_or_seq, _setup_size, ) -from .utils import has_all, has_any, is_simple_tensor, query_bounding_box, query_spatial_size +from .utils import has_all, has_any, is_simple_tensor, query_bounding_box, query_size class RandomHorizontalFlip(_RandomApplyTransform): @@ -267,7 +267,7 @@ def __init__( self._log_ratio = torch.log(torch.tensor(self.ratio)) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - height, width = query_spatial_size(flat_inputs) + height, width = query_size(flat_inputs) area = height * width log_ratio = self._log_ratio @@ -558,7 +558,7 @@ def __init__( raise ValueError(f"Invalid canvas side range provided {side_range}.") def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - orig_h, orig_w = query_spatial_size(flat_inputs) + orig_h, orig_w = query_size(flat_inputs) r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) canvas_width = int(orig_w * r) @@ -735,7 +735,7 @@ def __init__( self.center = center def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - height, width = query_spatial_size(flat_inputs) + height, width = query_size(flat_inputs) angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item() if self.translate is not None: @@ -859,7 +859,7 @@ def __init__( self.padding_mode = padding_mode def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - padded_height, padded_width = query_spatial_size(flat_inputs) + padded_height, padded_width = query_size(flat_inputs) if self.padding is not None: pad_left, pad_right, pad_top, pad_bottom = self.padding @@ -972,7 +972,7 @@ def __init__( self._fill = _setup_fill_arg(fill) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - height, width = query_spatial_size(flat_inputs) + height, width = query_size(flat_inputs) distortion_scale = self.distortion_scale @@ -1072,7 +1072,7 @@ def __init__( self._fill = _setup_fill_arg(fill) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - size = list(query_spatial_size(flat_inputs)) + size = list(query_size(flat_inputs)) dx = torch.rand([1, 1] + size) * 2 - 1 if self.sigma[0] > 0.0: @@ -1164,7 +1164,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: ) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - orig_h, orig_w = query_spatial_size(flat_inputs) + orig_h, orig_w = query_size(flat_inputs) bboxes = query_bounding_box(flat_inputs) while True: @@ -1282,7 +1282,7 @@ def __init__( self.antialias = antialias def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - orig_height, orig_width = query_spatial_size(flat_inputs) + orig_height, orig_width = query_size(flat_inputs) scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale @@ -1347,7 +1347,7 @@ def __init__( self.antialias = antialias def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - orig_height, orig_width = query_spatial_size(flat_inputs) + orig_height, orig_width = 
query_size(flat_inputs)
 
         min_size = self.min_size[int(torch.randint(len(self.min_size), ()))]
         r = min_size / min(orig_height, orig_width)
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py
index b7e2a42259f..f4221476334 100644
--- a/torchvision/transforms/v2/_meta.py
+++ b/torchvision/transforms/v2/_meta.py
@@ -75,7 +75,7 @@ def _transform(
 class ClampBoundingBox(Transform):
     """[BETA] Clamp bounding boxes to their corresponding image dimensions.
 
-    The clamping is done according to the bounding boxes' ``spatial_size`` meta-data.
+    The clamping is done according to the bounding boxes' ``canvas_size`` meta-data.
 
     .. v2betastatus:: ClampBoundingBox transform
 
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 90741c4ec7d..35fdcef494f 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -384,7 +384,7 @@ def forward(self, *inputs: Any) -> Any:
         valid = (ws >= self.min_size) & (hs >= self.min_size) & (boxes >= 0).all(dim=-1)
         # TODO: Do we really need to check for out of bounds here? All
         # transforms should be clamping anyway, so this should never happen?
-        image_h, image_w = boxes.spatial_size
+        image_h, image_w = boxes.canvas_size
         valid &= (boxes[:, 0] <= image_w) & (boxes[:, 2] <= image_w)
         valid &= (boxes[:, 1] <= image_h) & (boxes[:, 3] <= image_h)
 
diff --git a/torchvision/transforms/v2/functional/__init__.py b/torchvision/transforms/v2/functional/__init__.py
index b4803f4f1b9..c9cf2a9d920 100644
--- a/torchvision/transforms/v2/functional/__init__.py
+++ b/torchvision/transforms/v2/functional/__init__.py
@@ -19,12 +19,12 @@
     get_num_channels_image_pil,
     get_num_channels_video,
     get_num_channels,
-    get_spatial_size_bounding_box,
-    get_spatial_size_image_tensor,
-    get_spatial_size_image_pil,
-    get_spatial_size_mask,
-    get_spatial_size_video,
-    get_spatial_size,
+    get_size_bounding_box,
+    get_size_image_tensor,
+    get_size_image_pil,
+    get_size_mask,
+    get_size_video,
+    get_size,
 )  # usort: skip
 
 from ._augment import erase, erase_image_pil, erase_image_tensor, erase_video
diff --git a/torchvision/transforms/v2/functional/_deprecated.py b/torchvision/transforms/v2/functional/_deprecated.py
index c9a0f647e60..f4b7b8e8565 100644
--- a/torchvision/transforms/v2/functional/_deprecated.py
+++ b/torchvision/transforms/v2/functional/_deprecated.py
@@ -19,6 +19,6 @@ def to_tensor(inpt: Any) -> torch.Tensor:
 
 def get_image_size(inpt: Union[datapoints._ImageTypeJIT, datapoints._VideoTypeJIT]) -> List[int]:
     warnings.warn(
         "The function `get_image_size(...)` is deprecated and will be removed in a future release. "
-        "Instead, please use `get_spatial_size(...)` which returns `[h, w]` instead of `[w, h]`."
+        "Instead, please use `get_size(...)` which returns `[h, w]` instead of `[w, h]`."
) return _F.get_image_size(inpt) diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index e1dd2866bc5..4bbcf285fdf 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -23,7 +23,7 @@ from torchvision.utils import _log_api_usage_once -from ._meta import clamp_bounding_box, convert_format_bounding_box, get_spatial_size_image_pil +from ._meta import clamp_bounding_box, convert_format_bounding_box, get_size_image_pil from ._utils import is_simple_tensor @@ -52,18 +52,18 @@ def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor: def horizontal_flip_bounding_box( - bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int] + bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, canvas_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_box.shape bounding_box = bounding_box.clone().reshape(-1, 4) if format == datapoints.BoundingBoxFormat.XYXY: - bounding_box[:, [2, 0]] = bounding_box[:, [0, 2]].sub_(spatial_size[1]).neg_() + bounding_box[:, [2, 0]] = bounding_box[:, [0, 2]].sub_(canvas_size[1]).neg_() elif format == datapoints.BoundingBoxFormat.XYWH: - bounding_box[:, 0].add_(bounding_box[:, 2]).sub_(spatial_size[1]).neg_() + bounding_box[:, 0].add_(bounding_box[:, 2]).sub_(canvas_size[1]).neg_() else: # format == datapoints.BoundingBoxFormat.CXCYWH: - bounding_box[:, 0].sub_(spatial_size[1]).neg_() + bounding_box[:, 0].sub_(canvas_size[1]).neg_() return bounding_box.reshape(shape) @@ -102,18 +102,18 @@ def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor: def vertical_flip_bounding_box( - bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int] + bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, canvas_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_box.shape bounding_box = bounding_box.clone().reshape(-1, 4) if format == datapoints.BoundingBoxFormat.XYXY: - bounding_box[:, [1, 3]] = bounding_box[:, [3, 1]].sub_(spatial_size[0]).neg_() + bounding_box[:, [1, 3]] = bounding_box[:, [3, 1]].sub_(canvas_size[0]).neg_() elif format == datapoints.BoundingBoxFormat.XYWH: - bounding_box[:, 1].add_(bounding_box[:, 3]).sub_(spatial_size[0]).neg_() + bounding_box[:, 1].add_(bounding_box[:, 3]).sub_(canvas_size[0]).neg_() else: # format == datapoints.BoundingBoxFormat.CXCYWH: - bounding_box[:, 1].sub_(spatial_size[0]).neg_() + bounding_box[:, 1].sub_(canvas_size[0]).neg_() return bounding_box.reshape(shape) @@ -146,7 +146,7 @@ def vertical_flip(inpt: datapoints._InputTypeJIT) -> datapoints._InputTypeJIT: def _compute_resized_output_size( - spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + canvas_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> List[int]: if isinstance(size, int): size = [size] @@ -155,7 +155,7 @@ def _compute_resized_output_size( "max_size should only be passed if size specifies the length of the smaller edge, " "i.e. size should be an int or a sequence of length 1 in torchscript mode." 
) - return __compute_resized_output_size(spatial_size, size=size, max_size=max_size) + return __compute_resized_output_size(canvas_size, size=size, max_size=max_size) def resize_image_tensor( @@ -275,13 +275,13 @@ def resize_mask(mask: torch.Tensor, size: List[int], max_size: Optional[int] = N def resize_bounding_box( - bounding_box: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + bounding_box: torch.Tensor, canvas_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> Tuple[torch.Tensor, Tuple[int, int]]: - old_height, old_width = spatial_size - new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size) + old_height, old_width = canvas_size + new_height, new_width = _compute_resized_output_size(canvas_size, size=size, max_size=max_size) if (new_height, new_width) == (old_height, old_width): - return bounding_box, spatial_size + return bounding_box, canvas_size w_ratio = new_width / old_width h_ratio = new_height / old_height @@ -643,7 +643,7 @@ def affine_image_pil( # it is visually better to estimate the center without 0.5 offset # otherwise image rotated by 90 degrees is shifted vs output image of torch.rot90 or F_t.affine if center is None: - height, width = get_spatial_size_image_pil(image) + height, width = get_size_image_pil(image) center = [width * 0.5, height * 0.5] matrix = _get_inverse_affine_matrix(center, angle, translate, scale, shear) @@ -653,7 +653,7 @@ def affine_image_pil( def _affine_bounding_box_with_expand( bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -662,7 +662,7 @@ def _affine_bounding_box_with_expand( expand: bool = False, ) -> Tuple[torch.Tensor, Tuple[int, int]]: if bounding_box.numel() == 0: - return bounding_box, spatial_size + return bounding_box, canvas_size original_shape = bounding_box.shape original_dtype = bounding_box.dtype @@ -680,7 +680,7 @@ def _affine_bounding_box_with_expand( ) if center is None: - height, width = spatial_size + height, width = canvas_size center = [width * 0.5, height * 0.5] affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False) @@ -710,7 +710,7 @@ def _affine_bounding_box_with_expand( if expand: # Compute minimum point for transformed image frame: # Points are Top-Left, Top-Right, Bottom-Left, Bottom-Right points. 
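        # (Illustrative note, not from the original patch) These canvas corners are pushed through the same
        # affine matrix as the boxes; their minimum determines how the boxes are shifted into the expanded
        # frame, while the expanded canvas_size itself comes from _compute_affine_output_size() below.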
- height, width = spatial_size + height, width = canvas_size points = torch.tensor( [ [0.0, 0.0, 1.0], @@ -728,21 +728,21 @@ def _affine_bounding_box_with_expand( # Estimate meta-data for image with inverted=True and with center=[0,0] affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear) new_width, new_height = _compute_affine_output_size(affine_vector, width, height) - spatial_size = (new_height, new_width) + canvas_size = (new_height, new_width) - out_bboxes = clamp_bounding_box(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size) + out_bboxes = clamp_bounding_box(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size) out_bboxes = convert_format_bounding_box( out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True ).reshape(original_shape) out_bboxes = out_bboxes.to(original_dtype) - return out_bboxes, spatial_size + return out_bboxes, canvas_size def affine_bounding_box( bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -752,7 +752,7 @@ def affine_bounding_box( out_box, _ = _affine_bounding_box_with_expand( bounding_box, format=format, - spatial_size=spatial_size, + canvas_size=canvas_size, angle=angle, translate=translate, scale=scale, @@ -930,7 +930,7 @@ def rotate_image_pil( def rotate_bounding_box( bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], angle: float, expand: bool = False, center: Optional[List[float]] = None, @@ -941,7 +941,7 @@ def rotate_bounding_box( return _affine_bounding_box_with_expand( bounding_box, format=format, - spatial_size=spatial_size, + canvas_size=canvas_size, angle=-angle, translate=[0.0, 0.0], scale=1.0, @@ -1168,7 +1168,7 @@ def pad_mask( def pad_bounding_box( bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], padding: List[int], padding_mode: str = "constant", ) -> Tuple[torch.Tensor, Tuple[int, int]]: @@ -1184,12 +1184,12 @@ def pad_bounding_box( pad = [left, top, 0, 0] bounding_box = bounding_box + torch.tensor(pad, dtype=bounding_box.dtype, device=bounding_box.device) - height, width = spatial_size + height, width = canvas_size height += top + bottom width += left + right - spatial_size = (height, width) + canvas_size = (height, width) - return clamp_bounding_box(bounding_box, format=format, spatial_size=spatial_size), spatial_size + return clamp_bounding_box(bounding_box, format=format, canvas_size=canvas_size), canvas_size def pad_video( @@ -1261,9 +1261,9 @@ def crop_bounding_box( sub = [left, top, 0, 0] bounding_box = bounding_box - torch.tensor(sub, dtype=bounding_box.dtype, device=bounding_box.device) - spatial_size = (height, width) + canvas_size = (height, width) - return clamp_bounding_box(bounding_box, format=format, spatial_size=spatial_size), spatial_size + return clamp_bounding_box(bounding_box, format=format, canvas_size=canvas_size), canvas_size def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor: @@ -1412,7 +1412,7 @@ def perspective_image_pil( def perspective_bounding_box( bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], startpoints: Optional[List[List[int]]], endpoints: 
Optional[List[List[int]]], coefficients: Optional[List[float]] = None, @@ -1493,7 +1493,7 @@ def perspective_bounding_box( out_bboxes = clamp_bounding_box( torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype), format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=spatial_size, + canvas_size=canvas_size, ) # out_bboxes should be of shape [N boxes, 4] @@ -1651,7 +1651,7 @@ def _create_identity_grid(size: Tuple[int, int], device: torch.device, dtype: to def elastic_bounding_box( bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], displacement: torch.Tensor, ) -> torch.Tensor: if bounding_box.numel() == 0: @@ -1670,7 +1670,7 @@ def elastic_bounding_box( convert_format_bounding_box(bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY) ).reshape(-1, 4) - id_grid = _create_identity_grid(spatial_size, device=device, dtype=dtype) + id_grid = _create_identity_grid(canvas_size, device=device, dtype=dtype) # We construct an approximation of inverse grid as inv_grid = id_grid - displacement # This is not an exact inverse of the grid inv_grid = id_grid.sub_(displacement) @@ -1683,7 +1683,7 @@ def elastic_bounding_box( index_x, index_y = index_xy[:, 0], index_xy[:, 1] # Transform points: - t_size = torch.tensor(spatial_size[::-1], device=displacement.device, dtype=displacement.dtype) + t_size = torch.tensor(canvas_size[::-1], device=displacement.device, dtype=displacement.dtype) transformed_points = inv_grid[0, index_y, index_x, :].add_(1).mul_(0.5 * t_size).sub_(0.5) transformed_points = transformed_points.reshape(-1, 4, 2) @@ -1691,7 +1691,7 @@ def elastic_bounding_box( out_bboxes = clamp_bounding_box( torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype), format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=spatial_size, + canvas_size=canvas_size, ) return convert_format_bounding_box( @@ -1804,13 +1804,13 @@ def center_crop_image_tensor(image: torch.Tensor, output_size: List[int]) -> tor @torch.jit.unused def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL.Image.Image: crop_height, crop_width = _center_crop_parse_output_size(output_size) - image_height, image_width = get_spatial_size_image_pil(image) + image_height, image_width = get_size_image_pil(image) if crop_height > image_height or crop_width > image_width: padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) image = pad_image_pil(image, padding_ltrb, fill=0) - image_height, image_width = get_spatial_size_image_pil(image) + image_height, image_width = get_size_image_pil(image) if crop_width == image_width and crop_height == image_height: return image @@ -1821,11 +1821,11 @@ def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL def center_crop_bounding_box( bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], output_size: List[int], ) -> Tuple[torch.Tensor, Tuple[int, int]]: crop_height, crop_width = _center_crop_parse_output_size(output_size) - crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *spatial_size) + crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *canvas_size) return crop_bounding_box(bounding_box, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width) @@ -1903,7 +1903,7 @@ def resized_crop_bounding_box( size: List[int], ) -> 
Tuple[torch.Tensor, Tuple[int, int]]: bounding_box, _ = crop_bounding_box(bounding_box, format, top, left, height, width) - return resize_bounding_box(bounding_box, spatial_size=(height, width), size=size) + return resize_bounding_box(bounding_box, canvas_size=(height, width), size=size) def resized_crop_mask( @@ -1998,7 +1998,7 @@ def five_crop_image_pil( image: PIL.Image.Image, size: List[int] ) -> Tuple[PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image]: crop_height, crop_width = _parse_five_crop_size(size) - image_height, image_width = get_spatial_size_image_pil(image) + image_height, image_width = get_size_image_pil(image) if crop_width > image_width or crop_height > image_height: raise ValueError(f"Requested crop size {size} is bigger than input size {(image_height, image_width)}") diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py index 8ffa3966195..db0bd753cf3 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -27,23 +27,29 @@ def get_dimensions_image_tensor(image: torch.Tensor) -> List[int]: get_dimensions_image_pil = _FP.get_dimensions +def get_dimensions_video(video: torch.Tensor) -> List[int]: + return get_dimensions_image_tensor(video) + + def get_dimensions(inpt: Union[datapoints._ImageTypeJIT, datapoints._VideoTypeJIT]) -> List[int]: if not torch.jit.is_scripting(): _log_api_usage_once(get_dimensions) if torch.jit.is_scripting() or is_simple_tensor(inpt): return get_dimensions_image_tensor(inpt) - elif isinstance(inpt, (datapoints.Image, datapoints.Video)): - channels = inpt.num_channels - height, width = inpt.spatial_size - return [channels, height, width] - elif isinstance(inpt, PIL.Image.Image): - return get_dimensions_image_pil(inpt) - else: - raise TypeError( - f"Input can either be a plain tensor, an `Image` or `Video` datapoint, or a PIL image, " - f"but got {type(inpt)} instead." - ) + + for typ, get_size_fn in { + datapoints.Image: get_dimensions_image_tensor, + datapoints.Video: get_dimensions_video, + PIL.Image.Image: get_dimensions_image_pil, + }.items(): + if isinstance(inpt, typ): + return get_size_fn(inpt) + + raise TypeError( + f"Input can either be a plain tensor, an `Image` or `Video` datapoint, or a PIL image, " + f"but got {type(inpt)} instead." + ) def get_num_channels_image_tensor(image: torch.Tensor) -> int: @@ -70,15 +76,19 @@ def get_num_channels(inpt: Union[datapoints._ImageTypeJIT, datapoints._VideoType if torch.jit.is_scripting() or is_simple_tensor(inpt): return get_num_channels_image_tensor(inpt) - elif isinstance(inpt, (datapoints.Image, datapoints.Video)): - return inpt.num_channels - elif isinstance(inpt, PIL.Image.Image): - return get_num_channels_image_pil(inpt) - else: - raise TypeError( - f"Input can either be a plain tensor, an `Image` or `Video` datapoint, or a PIL image, " - f"but got {type(inpt)} instead." - ) + + for typ, get_size_fn in { + datapoints.Image: get_num_channels_image_tensor, + datapoints.Video: get_num_channels_video, + PIL.Image.Image: get_num_channels_image_pil, + }.items(): + if isinstance(inpt, typ): + return get_size_fn(inpt) + + raise TypeError( + f"Input can either be a plain tensor, an `Image` or `Video` datapoint, or a PIL image, " + f"but got {type(inpt)} instead." + ) # We changed the names to ensure it can be used not only for images but also videos. 
Thus, we just alias it without @@ -86,7 +96,7 @@ def get_num_channels(inpt: Union[datapoints._ImageTypeJIT, datapoints._VideoType get_image_num_channels = get_num_channels -def get_spatial_size_image_tensor(image: torch.Tensor) -> List[int]: +def get_size_image_tensor(image: torch.Tensor) -> List[int]: hw = list(image.shape[-2:]) ndims = len(hw) if ndims == 2: @@ -96,39 +106,45 @@ def get_spatial_size_image_tensor(image: torch.Tensor) -> List[int]: @torch.jit.unused -def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: +def get_size_image_pil(image: PIL.Image.Image) -> List[int]: width, height = _FP.get_image_size(image) return [height, width] -def get_spatial_size_video(video: torch.Tensor) -> List[int]: - return get_spatial_size_image_tensor(video) +def get_size_video(video: torch.Tensor) -> List[int]: + return get_size_image_tensor(video) -def get_spatial_size_mask(mask: torch.Tensor) -> List[int]: - return get_spatial_size_image_tensor(mask) +def get_size_mask(mask: torch.Tensor) -> List[int]: + return get_size_image_tensor(mask) @torch.jit.unused -def get_spatial_size_bounding_box(bounding_box: datapoints.BoundingBox) -> List[int]: - return list(bounding_box.spatial_size) +def get_size_bounding_box(bounding_box: datapoints.BoundingBox) -> List[int]: + return list(bounding_box.canvas_size) -def get_spatial_size(inpt: datapoints._InputTypeJIT) -> List[int]: +def get_size(inpt: datapoints._InputTypeJIT) -> List[int]: if not torch.jit.is_scripting(): - _log_api_usage_once(get_spatial_size) + _log_api_usage_once(get_size) if torch.jit.is_scripting() or is_simple_tensor(inpt): - return get_spatial_size_image_tensor(inpt) - elif isinstance(inpt, (datapoints.Image, datapoints.Video, datapoints.BoundingBox, datapoints.Mask)): - return list(inpt.spatial_size) - elif isinstance(inpt, PIL.Image.Image): - return get_spatial_size_image_pil(inpt) - else: - raise TypeError( - f"Input can either be a plain tensor, any TorchVision datapoint, or a PIL image, " - f"but got {type(inpt)} instead." - ) + return get_size_image_tensor(inpt) + + for typ, get_size_fn in { + datapoints.Image: get_size_image_tensor, + datapoints.BoundingBox: get_size_bounding_box, + datapoints.Mask: get_size_mask, + datapoints.Video: get_size_video, + PIL.Image.Image: get_size_image_pil, + }.items(): + if isinstance(inpt, typ): + return get_size_fn(inpt) + + raise TypeError( + f"Input can either be a plain tensor, any TorchVision datapoint, or a PIL image, " + f"but got {type(inpt)} instead." 
+ ) def get_num_frames_video(video: torch.Tensor) -> int: @@ -142,7 +158,7 @@ def get_num_frames(inpt: datapoints._VideoTypeJIT) -> int: if torch.jit.is_scripting() or is_simple_tensor(inpt): return get_num_frames_video(inpt) elif isinstance(inpt, datapoints.Video): - return inpt.num_frames + return get_num_frames_video(inpt) else: raise TypeError(f"Input can either be a plain tensor or a `Video` datapoint, but got {type(inpt)} instead.") @@ -241,7 +257,7 @@ def convert_format_bounding_box( def _clamp_bounding_box( - bounding_box: torch.Tensor, format: BoundingBoxFormat, spatial_size: Tuple[int, int] + bounding_box: torch.Tensor, format: BoundingBoxFormat, canvas_size: Tuple[int, int] ) -> torch.Tensor: # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every # BoundingBoxFormat instead of converting back and forth @@ -250,8 +266,8 @@ def _clamp_bounding_box( xyxy_boxes = convert_format_bounding_box( bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True ) - xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1]) - xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0]) + xyxy_boxes[..., 0::2].clamp_(min=0, max=canvas_size[1]) + xyxy_boxes[..., 1::2].clamp_(min=0, max=canvas_size[0]) out_boxes = convert_format_bounding_box( xyxy_boxes, old_format=BoundingBoxFormat.XYXY, new_format=format, inplace=True ) @@ -261,19 +277,19 @@ def _clamp_bounding_box( def clamp_bounding_box( inpt: datapoints._InputTypeJIT, format: Optional[BoundingBoxFormat] = None, - spatial_size: Optional[Tuple[int, int]] = None, + canvas_size: Optional[Tuple[int, int]] = None, ) -> datapoints._InputTypeJIT: if not torch.jit.is_scripting(): _log_api_usage_once(clamp_bounding_box) if torch.jit.is_scripting() or is_simple_tensor(inpt): - if format is None or spatial_size is None: - raise ValueError("For simple tensor inputs, `format` and `spatial_size` has to be passed.") - return _clamp_bounding_box(inpt, format=format, spatial_size=spatial_size) + if format is None or canvas_size is None: + raise ValueError("For simple tensor inputs, `format` and `canvas_size` has to be passed.") + return _clamp_bounding_box(inpt, format=format, canvas_size=canvas_size) elif isinstance(inpt, datapoints.BoundingBox): - if format is not None or spatial_size is not None: - raise ValueError("For bounding box datapoint inputs, `format` and `spatial_size` must not be passed.") - output = _clamp_bounding_box(inpt.as_subclass(torch.Tensor), format=inpt.format, spatial_size=inpt.spatial_size) + if format is not None or canvas_size is not None: + raise ValueError("For bounding box datapoint inputs, `format` and `canvas_size` must not be passed.") + output = _clamp_bounding_box(inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size) return datapoints.BoundingBox.wrap_like(inpt, output) else: raise TypeError( diff --git a/torchvision/transforms/v2/utils.py b/torchvision/transforms/v2/utils.py index c4cf481bcd2..d28434cfd72 100644 --- a/torchvision/transforms/v2/utils.py +++ b/torchvision/transforms/v2/utils.py @@ -6,7 +6,7 @@ from torchvision import datapoints from torchvision._utils import sequence_to_str -from torchvision.transforms.v2.functional import get_dimensions, get_spatial_size, is_simple_tensor +from torchvision.transforms.v2.functional import get_dimensions, get_size, is_simple_tensor def query_bounding_box(flat_inputs: List[Any]) -> datapoints.BoundingBox: @@ -22,7 +22,7 @@ def query_chw(flat_inputs: List[Any]) -> Tuple[int, 
int, int]: chws = { tuple(get_dimensions(inpt)) for inpt in flat_inputs - if isinstance(inpt, (datapoints.Image, PIL.Image.Image, datapoints.Video)) or is_simple_tensor(inpt) + if check_type(inpt, (is_simple_tensor, datapoints.Image, PIL.Image.Image, datapoints.Video)) } if not chws: raise TypeError("No image or video was found in the sample") @@ -32,14 +32,21 @@ def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]: return c, h, w -def query_spatial_size(flat_inputs: List[Any]) -> Tuple[int, int]: +def query_size(flat_inputs: List[Any]) -> Tuple[int, int]: sizes = { - tuple(get_spatial_size(inpt)) + tuple(get_size(inpt)) for inpt in flat_inputs - if isinstance( - inpt, (datapoints.Image, PIL.Image.Image, datapoints.Video, datapoints.Mask, datapoints.BoundingBox) + if check_type( + inpt, + ( + is_simple_tensor, + datapoints.Image, + PIL.Image.Image, + datapoints.Video, + datapoints.Mask, + datapoints.BoundingBox, + ), ) - or is_simple_tensor(inpt) } if not sizes: raise TypeError("No image, video, mask or bounding box was found in the sample") From b629b9db957f17316e627424e3434dd68af548cf Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 17 Jul 2023 10:46:59 +0200 Subject: [PATCH 2/2] fix prototype tests --- test/test_prototype_transforms.py | 44 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 192d5ebacd0..15d649b35be 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -210,13 +210,13 @@ class TestFixedSizeCrop: def test__get_params(self, mocker): crop_size = (7, 7) batch_shape = (10,) - spatial_size = (11, 5) + canvas_size = (11, 5) transform = transforms.FixedSizeCrop(size=crop_size) flat_inputs = [ - make_image(size=spatial_size, color_space="RGB"), - make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=batch_shape), + make_image(canvas_size, color_space="RGB"), + make_bounding_box(canvas_size, format=BoundingBoxFormat.XYXY, batch_dims=batch_shape), ] params = transform._get_params(flat_inputs) @@ -295,7 +295,7 @@ def test__transform(self, mocker, needs): def test__transform_culling(self, mocker): batch_size = 10 - spatial_size = (10, 10) + canvas_size = (10, 10) is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) mocker.patch( @@ -304,17 +304,15 @@ def test__transform_culling(self, mocker): needs_crop=True, top=0, left=0, - height=spatial_size[0], - width=spatial_size[1], + height=canvas_size[0], + width=canvas_size[1], is_valid=is_valid, needs_pad=False, ), ) - bounding_boxes = make_bounding_box( - format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) - ) - masks = make_detection_mask(size=spatial_size, batch_dims=(batch_size,)) + bounding_boxes = make_bounding_box(canvas_size, format=BoundingBoxFormat.XYXY, batch_dims=(batch_size,)) + masks = make_detection_mask(canvas_size, batch_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -334,7 +332,7 @@ def test__transform_culling(self, mocker): def test__transform_bounding_box_clamping(self, mocker): batch_size = 3 - spatial_size = (10, 10) + canvas_size = (10, 10) mocker.patch( "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", @@ -342,16 +340,14 @@ def test__transform_bounding_box_clamping(self, mocker): needs_crop=True, top=0, left=0, - height=spatial_size[0], - width=spatial_size[1], + 
height=canvas_size[0], + width=canvas_size[1], is_valid=torch.full((batch_size,), fill_value=True), needs_pad=False, ), ) - bounding_box = make_bounding_box( - format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) - ) + bounding_box = make_bounding_box(canvas_size, format=BoundingBoxFormat.XYXY, batch_dims=(batch_size,)) mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") transform = transforms.FixedSizeCrop((-1, -1)) @@ -496,27 +492,27 @@ def make_datapoints(): pil_image = to_image_pil(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_mask(size, num_objects=num_objects, dtype=torch.long), } yield (pil_image, target) - tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) + tensor_image = torch.Tensor(make_image(size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_mask(size, num_objects=num_objects, dtype=torch.long), } yield (tensor_image, target) - datapoint_image = make_image(size=size, color_space="RGB") + datapoint_image = make_image(size, color_space="RGB") target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_mask(size, num_objects=num_objects, dtype=torch.long), } yield (datapoint_image, target)
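A minimal, illustrative sketch of the renamed API introduced by this patch (not part of the patch itself; the tensor shape, box coordinates, and padding below are made-up demonstration values):

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import functional as F

    image = datapoints.Image(torch.rand(3, 32, 32))

    # canvas_size replaces the old spatial_size metadata and matches F.get_size(image) == [H, W]
    box = datapoints.BoundingBox(
        [2, 2, 20, 20],
        format=datapoints.BoundingBoxFormat.XYXY,
        canvas_size=tuple(F.get_size(image)),
    )

    # Kernels that change the canvas now return the new canvas_size alongside the transformed boxes.
    padded_box, new_canvas_size = F.pad_bounding_box(
        box.as_subclass(torch.Tensor),
        format=box.format,
        canvas_size=box.canvas_size,
        padding=[2],
    )
    assert new_canvas_size == (36, 36)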