diff --git a/torchvision/datapoints/_dataset_wrapper.py b/torchvision/datapoints/_dataset_wrapper.py
index cce8f1b2e84..09a5469dde1 100644
--- a/torchvision/datapoints/_dataset_wrapper.py
+++ b/torchvision/datapoints/_dataset_wrapper.py
@@ -38,8 +38,8 @@ def wrap_dataset_for_transforms_v2(dataset, target_keys=None):
     * :class:`~torchvision.datasets.CocoDetection`: Instead of returning the target as list of dicts, the wrapper
       returns a dict of lists. In addition, the key-value-pairs ``"boxes"`` (in ``XYXY`` coordinate format),
       ``"masks"`` and ``"labels"`` are added and wrap the data in the corresponding ``torchvision.datapoints``.
-      The original keys are preserved. If ``target_keys`` is ommitted, returns only the values for the ``"boxes"``
-      and ``"labels"``.
+      The original keys are preserved. If ``target_keys`` is omitted, returns only the values for the
+      ``"image_id"``, ``"boxes"``, and ``"labels"``.
     * :class:`~torchvision.datasets.VOCDetection`: The key-value-pairs ``"boxes"`` and ``"labels"`` are added to
       the target and wrap the data in the corresponding ``torchvision.datapoints``. The original keys are
       preserved. If ``target_keys`` is ommitted, returns only the values for the ``"boxes"`` and ``"labels"``.
@@ -328,7 +328,7 @@ def coco_dectection_wrapper_factory(dataset, target_keys):
             "masks",
             "labels",
         },
-        default={"boxes", "labels"},
+        default={"image_id", "boxes", "labels"},
     )
 
     def segmentation_to_mask(segmentation, *, spatial_size):
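
A rough usage sketch of the new default behavior for ``CocoDetection`` (the dataset paths are placeholders, and the expected keys simply mirror the default set introduced in this patch):

```python
from torchvision import datapoints, datasets

# Placeholder paths; point these at a real COCO-style layout.
dataset = datasets.CocoDetection("path/to/images", "path/to/instances.json")

# target_keys is omitted, so the wrapper falls back to the new default,
# {"image_id", "boxes", "labels"}.
dataset = datapoints.wrap_dataset_for_transforms_v2(dataset)

img, target = dataset[0]
print(sorted(target))  # ['boxes', 'image_id', 'labels']
```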