diff --git a/.github/workflows/full.yml b/.github/workflows/full.yml
index e587e26aa1b8..e42380de5ead 100644
--- a/.github/workflows/full.yml
+++ b/.github/workflows/full.yml
@@ -156,7 +156,7 @@ jobs:
       - name: Install SDK
         run: |
           pip3 install -r ./tests/python/requirements.txt \
-            -e './cvat-sdk[pytorch]' -e ./cvat-cli \
+            -e './cvat-sdk[masks,pytorch]' -e ./cvat-cli \
             --extra-index-url https://download.pytorch.org/whl/cpu
 
       - name: Running REST API and SDK tests
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f4e3f11d1052..becca0218f94 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -166,7 +166,7 @@ jobs:
       - name: Install SDK
         run: |
           pip3 install -r ./tests/python/requirements.txt \
-            -e './cvat-sdk[pytorch]' -e ./cvat-cli \
+            -e './cvat-sdk[masks,pytorch]' -e ./cvat-cli \
             --extra-index-url https://download.pytorch.org/whl/cpu
 
       - name: Run REST API and SDK tests
diff --git a/changelog.d/20241120_143739_roman_aa_masks.md b/changelog.d/20241120_143739_roman_aa_masks.md
new file mode 100644
index 000000000000..97422dfe6060
--- /dev/null
+++ b/changelog.d/20241120_143739_roman_aa_masks.md
@@ -0,0 +1,13 @@
+### Added
+
+- \[SDK\] Added new auto-annotation helpers (`mask`, `polygon`, `encode_mask`)
+  to support AA functions that return masks or polygons
+  ()
+
+- \[SDK\] Added a new built-in auto-annotation function,
+  `torchvision_instance_segmentation`
+  ()
+
+- \[SDK, CLI\] Added a new auto-annotation parameter, `conv_mask_to_poly`
+  (`--conv-mask-to-poly` in the CLI)
+  ()
diff --git a/cvat-cli/src/cvat_cli/_internal/commands.py b/cvat-cli/src/cvat_cli/_internal/commands.py
index f49416c843e5..324d427a64b8 100644
--- a/cvat-cli/src/cvat_cli/_internal/commands.py
+++ b/cvat-cli/src/cvat_cli/_internal/commands.py
@@ -476,6 +476,12 @@ def configure_parser(self, parser: argparse.ArgumentParser) -> None:
             default=None,
         )
 
+        parser.add_argument(
+            "--conv-mask-to-poly",
+            action="store_true",
+            help="Convert mask shapes to polygon shapes",
+        )
+
     def execute(
         self,
         client: Client,
@@ -487,6 +493,7 @@ def execute(
         clear_existing: bool = False,
         allow_unmatched_labels: bool = False,
         conf_threshold: Optional[float],
+        conv_mask_to_poly: bool,
     ) -> None:
         if function_module is not None:
             function = importlib.import_module(function_module)
@@ -512,4 +519,5 @@ def execute(
             clear_existing=clear_existing,
             allow_unmatched_labels=allow_unmatched_labels,
             conf_threshold=conf_threshold,
+            conv_mask_to_poly=conv_mask_to_poly,
         )
diff --git a/cvat-sdk/README.md b/cvat-sdk/README.md
index fa68c0e5d40d..89702c02abd4 100644
--- a/cvat-sdk/README.md
+++ b/cvat-sdk/README.md
@@ -20,7 +20,14 @@ To install a prebuilt package, run the following command in the terminal:
 pip install cvat-sdk
 ```
 
-To use the PyTorch adapter, request the `pytorch` extra:
+To use the `cvat_sdk.masks` module, request the `masks` extra:
+
+```bash
+pip install "cvat-sdk[masks]"
+```
+
+To use the PyTorch adapter or the built-in PyTorch-based auto-annotation functions,
+request the `pytorch` extra:
 
 ```bash
 pip install "cvat-sdk[pytorch]"
"DetectionFunctionContext", + "DetectionFunctionSpec", + "keypoint_spec", + "keypoint", + "label_spec", + "mask", + "polygon", + "rectangle", + "shape", + "skeleton_label_spec", + "skeleton", +] diff --git a/cvat-sdk/cvat_sdk/auto_annotation/driver.py b/cvat-sdk/cvat_sdk/auto_annotation/driver.py index 175b96ab29b2..5ffdb36f5bee 100644 --- a/cvat-sdk/cvat_sdk/auto_annotation/driver.py +++ b/cvat-sdk/cvat_sdk/auto_annotation/driver.py @@ -99,9 +99,11 @@ def __init__( ds_labels: Sequence[models.ILabel], *, allow_unmatched_labels: bool, + conv_mask_to_poly: bool, ) -> None: self._logger = logger self._allow_unmatched_labels = allow_unmatched_labels + self._conv_mask_to_poly = conv_mask_to_poly ds_labels_by_name = {ds_label.name: ds_label for ds_label in ds_labels} @@ -217,6 +219,11 @@ def validate_and_remap(self, shapes: list[models.LabeledShapeRequest], ds_frame: if getattr(shape, "elements", None): raise BadFunctionError("function output non-skeleton shape with elements") + if shape.type.value == "mask" and self._conv_mask_to_poly: + raise BadFunctionError( + "function output mask shape despite conv_mask_to_poly=True" + ) + shapes[:] = new_shapes @@ -224,6 +231,7 @@ def validate_and_remap(self, shapes: list[models.LabeledShapeRequest], ds_frame: class _DetectionFunctionContextImpl(DetectionFunctionContext): frame_name: str conf_threshold: Optional[float] = None + conv_mask_to_poly: bool = False def annotate_task( @@ -235,6 +243,7 @@ def annotate_task( clear_existing: bool = False, allow_unmatched_labels: bool = False, conf_threshold: Optional[float] = None, + conv_mask_to_poly: bool = False, ) -> None: """ Downloads data for the task with the given ID, applies the given function to it @@ -268,7 +277,11 @@ def annotate_task( function that refer to this label are ignored. Otherwise, BadFunctionError is raised. The conf_threshold parameter must be None or a number between 0 and 1. It will be passed - to the function as the conf_threshold attribute of the context object. + to the AA function as the conf_threshold attribute of the context object. + + The conv_mask_to_poly parameter will be passed to the AA function as the conv_mask_to_poly + attribute of the context object. If it's true, and the AA function returns any mask shapes, + BadFunctionError will be raised. 
""" if pbar is None: @@ -286,6 +299,7 @@ def annotate_task( function.spec.labels, dataset.labels, allow_unmatched_labels=allow_unmatched_labels, + conv_mask_to_poly=conv_mask_to_poly, ) shapes = [] @@ -294,7 +308,9 @@ def annotate_task( for sample in pbar.iter(dataset.samples): frame_shapes = function.detect( _DetectionFunctionContextImpl( - frame_name=sample.frame_name, conf_threshold=conf_threshold + frame_name=sample.frame_name, + conf_threshold=conf_threshold, + conv_mask_to_poly=conv_mask_to_poly, ), sample.media.load_image(), ) diff --git a/cvat-sdk/cvat_sdk/auto_annotation/functions/_torchvision.py b/cvat-sdk/cvat_sdk/auto_annotation/functions/_torchvision.py new file mode 100644 index 000000000000..9fa88e0a7c07 --- /dev/null +++ b/cvat-sdk/cvat_sdk/auto_annotation/functions/_torchvision.py @@ -0,0 +1,26 @@ +# Copyright (C) 2024 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + +from functools import cached_property + +import torchvision.models + +import cvat_sdk.auto_annotation as cvataa + + +class TorchvisionFunction: + def __init__(self, model_name: str, weights_name: str = "DEFAULT", **kwargs) -> None: + weights_enum = torchvision.models.get_model_weights(model_name) + self._weights = weights_enum[weights_name] + self._transforms = self._weights.transforms() + self._model = torchvision.models.get_model(model_name, weights=self._weights, **kwargs) + self._model.eval() + + @cached_property + def spec(self) -> cvataa.DetectionFunctionSpec: + return cvataa.DetectionFunctionSpec( + labels=[ + cvataa.label_spec(cat, i) for i, cat in enumerate(self._weights.meta["categories"]) + ] + ) diff --git a/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_detection.py b/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_detection.py index 423db05adbcb..b16e4d8874ae 100644 --- a/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_detection.py +++ b/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_detection.py @@ -2,31 +2,15 @@ # # SPDX-License-Identifier: MIT -from functools import cached_property - import PIL.Image -import torchvision.models import cvat_sdk.auto_annotation as cvataa import cvat_sdk.models as models +from ._torchvision import TorchvisionFunction -class _TorchvisionDetectionFunction: - def __init__(self, model_name: str, weights_name: str = "DEFAULT", **kwargs) -> None: - weights_enum = torchvision.models.get_model_weights(model_name) - self._weights = weights_enum[weights_name] - self._transforms = self._weights.transforms() - self._model = torchvision.models.get_model(model_name, weights=self._weights, **kwargs) - self._model.eval() - - @cached_property - def spec(self) -> cvataa.DetectionFunctionSpec: - return cvataa.DetectionFunctionSpec( - labels=[ - cvataa.label_spec(cat, i) for i, cat in enumerate(self._weights.meta["categories"]) - ] - ) +class _TorchvisionDetectionFunction(TorchvisionFunction): def detect( self, context: cvataa.DetectionFunctionContext, image: PIL.Image.Image ) -> list[models.LabeledShapeRequest]: diff --git a/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_instance_segmentation.py b/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_instance_segmentation.py new file mode 100644 index 000000000000..6aa891811f5b --- /dev/null +++ b/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_instance_segmentation.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + +import math +from collections.abc import Iterator + +import numpy as np +import PIL.Image +from skimage 
+from skimage import measure
+from torch import Tensor
+
+import cvat_sdk.auto_annotation as cvataa
+import cvat_sdk.models as models
+from cvat_sdk.masks import encode_mask
+
+from ._torchvision import TorchvisionFunction
+
+
+def _is_positively_oriented(contour: np.ndarray) -> bool:
+    ys, xs = contour.T
+
+    # This is the shoelace formula, except we only need the sign of the result,
+    # so we compare instead of subtracting. Compared to the typical formula,
+    # the sign is inverted, because the Y axis points downwards.
+    return np.sum(xs * np.roll(ys, -1)) < np.sum(ys * np.roll(xs, -1))
+
+
+def _generate_shapes(
+    context: cvataa.DetectionFunctionContext, box: Tensor, mask: Tensor, label: Tensor
+) -> Iterator[models.LabeledShapeRequest]:
+    LEVEL = 0.5
+
+    if context.conv_mask_to_poly:
+        # Since we treat mask values of exactly LEVEL as true, we'd like them
+        # to also be considered high by find_contours. And for that, the level
+        # parameter must be slightly less than LEVEL.
+        contours = measure.find_contours(mask[0].detach().numpy(), level=math.nextafter(LEVEL, 0))
+
+        for contour in contours:
+            if len(contour) < 3 or _is_positively_oriented(contour):
+                continue
+
+            contour = measure.approximate_polygon(contour, tolerance=2.5)
+
+            yield cvataa.polygon(label.item(), contour[:, ::-1].ravel().tolist())
+
+    else:
+        yield cvataa.mask(label.item(), encode_mask(mask[0] >= LEVEL, box.tolist()))
+
+
+class _TorchvisionInstanceSegmentationFunction(TorchvisionFunction):
+    def detect(
+        self, context: cvataa.DetectionFunctionContext, image: PIL.Image.Image
+    ) -> list[models.LabeledShapeRequest]:
+        conf_threshold = context.conf_threshold or 0
+        results = self._model([self._transforms(image)])
+
+        return [
+            shape
+            for result in results
+            for box, mask, label, score in zip(
+                result["boxes"], result["masks"], result["labels"], result["scores"]
+            )
+            if score >= conf_threshold
+            for shape in _generate_shapes(context, box, mask, label)
+        ]
+
+
+create = _TorchvisionInstanceSegmentationFunction
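The orientation check above is what lets the polygon path keep only one traversal direction of the contours that `find_contours` returns, so that hole boundaries (traversed in the opposite direction) are dropped. A standalone sketch of the same sign test on a toy square, with illustrative coordinates (contour rows are `(y, x)` pairs, as in scikit-image):

```python
import numpy as np


def is_positively_oriented(contour: np.ndarray) -> bool:
    # Same shoelace-sign comparison as _is_positively_oriented above.
    ys, xs = contour.T
    return np.sum(xs * np.roll(ys, -1)) < np.sum(ys * np.roll(xs, -1))


# A square traversed clockwise on screen (the Y axis points down):
# right along the top, down, left along the bottom, up.
square = np.array([[0, 0], [0, 2], [2, 2], [2, 0]], dtype=float)

print(is_positively_oriented(square))        # False
print(is_positively_oriented(square[::-1]))  # True: reversing the traversal flips the sign
```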
diff --git a/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_keypoint_detection.py b/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_keypoint_detection.py
index 0756b0b1738c..4d2250d61c35 100644
--- a/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_keypoint_detection.py
+++ b/cvat-sdk/cvat_sdk/auto_annotation/functions/torchvision_keypoint_detection.py
@@ -5,20 +5,14 @@
 from functools import cached_property
 
 import PIL.Image
-import torchvision.models
 
 import cvat_sdk.auto_annotation as cvataa
 import cvat_sdk.models as models
 
+from ._torchvision import TorchvisionFunction
 
-class _TorchvisionKeypointDetectionFunction:
-    def __init__(self, model_name: str, weights_name: str = "DEFAULT", **kwargs) -> None:
-        weights_enum = torchvision.models.get_model_weights(model_name)
-        self._weights = weights_enum[weights_name]
-        self._transforms = self._weights.transforms()
-        self._model = torchvision.models.get_model(model_name, weights=self._weights, **kwargs)
-        self._model.eval()
 
+class _TorchvisionKeypointDetectionFunction(TorchvisionFunction):
     @cached_property
     def spec(self) -> cvataa.DetectionFunctionSpec:
         return cvataa.DetectionFunctionSpec(
diff --git a/cvat-sdk/cvat_sdk/auto_annotation/interface.py b/cvat-sdk/cvat_sdk/auto_annotation/interface.py
index 47e944a1de84..f95cb50b4f2d 100644
--- a/cvat-sdk/cvat_sdk/auto_annotation/interface.py
+++ b/cvat-sdk/cvat_sdk/auto_annotation/interface.py
@@ -68,6 +68,16 @@ def conf_threshold(self) -> Optional[float]:
         If the function is not able to estimate confidence levels,
         it can ignore this value.
         """
 
+    @property
+    @abc.abstractmethod
+    def conv_mask_to_poly(self) -> bool:
+        """
+        If this is true, the function must convert any mask shapes to polygon shapes
+        before returning them.
+
+        If the function does not return any mask shapes, then it can ignore this value.
+        """
+
 
 class DetectionFunction(Protocol):
     """
@@ -168,6 +178,21 @@ def rectangle(label_id: int, points: Sequence[float], **kwargs) -> models.Labele
     return shape(label_id, type="rectangle", points=points, **kwargs)
 
 
+def polygon(label_id: int, points: Sequence[float], **kwargs) -> models.LabeledShapeRequest:
+    """Helper factory function for LabeledShapeRequest with frame=0 and type="polygon"."""
+    return shape(label_id, type="polygon", points=points, **kwargs)
+
+
+def mask(label_id: int, points: Sequence[float], **kwargs) -> models.LabeledShapeRequest:
+    """
+    Helper factory function for LabeledShapeRequest with frame=0 and type="mask".
+
+    It's recommended to use the cvat_sdk.masks.encode_mask function to build the
+    points argument.
+    """
+    return shape(label_id, type="mask", points=points, **kwargs)
+
+
 def skeleton(
     label_id: int, elements: Sequence[models.SubLabeledShapeRequest], **kwargs
 ) -> models.LabeledShapeRequest:
diff --git a/cvat-sdk/cvat_sdk/masks.py b/cvat-sdk/cvat_sdk/masks.py
new file mode 100644
index 000000000000..f623aec7d043
--- /dev/null
+++ b/cvat-sdk/cvat_sdk/masks.py
@@ -0,0 +1,44 @@
+# Copyright (C) 2024 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import math
+from collections.abc import Sequence
+
+import numpy as np
+from numpy.typing import ArrayLike
+
+
+def encode_mask(bitmap: ArrayLike, /, bbox: Sequence[float]) -> list[float]:
+    """
+    Encodes an image mask into an array of numbers suitable for the "points"
+    attribute of a LabeledShapeRequest object of type "mask".
+
+    bitmap must be a boolean array of shape (H, W), where H is the height and
+    W is the width of the image that the mask applies to.
+
+    bbox must have the form [x1, y1, x2, y2], where (0, 0) <= (x1, y1) < (x2, y2) <= (W, H).
+    The mask will be limited to points between (x1, y1) and (x2, y2).
+    """
+
+    bitmap = np.asanyarray(bitmap)
+    if bitmap.ndim != 2:
+        raise ValueError("bitmap must have 2 dimensions")
+    if bitmap.dtype != np.bool_:
+        raise ValueError("bitmap must have boolean items")
+
+    x1, y1 = map(math.floor, bbox[0:2])
+    x2, y2 = map(math.ceil, bbox[2:4])
+
+    if not (0 <= x1 < x2 <= bitmap.shape[1] and 0 <= y1 < y2 <= bitmap.shape[0]):
+        raise ValueError("bbox has invalid coordinates")
+
+    flat = bitmap[y1:y2, x1:x2].ravel()
+
+    (run_indices,) = np.diff(flat, prepend=[not flat[0]], append=[not flat[-1]]).nonzero()
+    if flat[0]:
+        run_lengths = np.diff(run_indices, prepend=[0])
+    else:
+        run_lengths = np.diff(run_indices)
+
+    return run_lengths.tolist() + [x1, y1, x2 - 1, y2 - 1]
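To make the encoding concrete, here is a small worked example; the values follow the `test_masks.py` cases added at the end of this patch. The run lengths alternate between 0- and 1-valued pixels, starting with 0s, and the trailing four numbers are the inclusive bounding box.

```python
import numpy as np

from cvat_sdk.masks import encode_mask

bitmap = np.array(
    [
        [0, 0, 1, 1, 1, 0],
        [0, 1, 1, 0, 0, 0],
    ],
    dtype=np.bool_,
)

# The bbox is rounded outward to [2, 0, 5, 2]; the cropped region
#   111
#   100
# flattens to 111100: zero 0s, four 1s, two 0s -> runs [0, 4, 2],
# followed by the inclusive bbox [2, 0, 4, 1].
print(encode_mask(bitmap, [2.9, 0.9, 4.1, 1.1]))  # [0, 4, 2, 2, 0, 4, 1]
```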
diff --git a/cvat-sdk/gen/templates/openapi-generator/setup.mustache b/cvat-sdk/gen/templates/openapi-generator/setup.mustache
index eb89f5d20554..e0379cabd06e 100644
--- a/cvat-sdk/gen/templates/openapi-generator/setup.mustache
+++ b/cvat-sdk/gen/templates/openapi-generator/setup.mustache
@@ -77,7 +77,8 @@ setup(
     python_requires="{{{generatorLanguageVersion}}}",
     install_requires=BASE_REQUIREMENTS,
     extras_require={
-        "pytorch": ['torch', 'torchvision'],
+        "masks": ["numpy>=2"],
+        "pytorch": ['torch', 'torchvision', 'scikit-image>=0.24', 'cvat_sdk[masks]'],
     },
     package_dir={"": "."},
     packages=find_packages(include=["cvat_sdk*"]),
diff --git a/site/content/en/docs/api_sdk/sdk/_index.md b/site/content/en/docs/api_sdk/sdk/_index.md
index e9683583ab0e..e855dadd979f 100644
--- a/site/content/en/docs/api_sdk/sdk/_index.md
+++ b/site/content/en/docs/api_sdk/sdk/_index.md
@@ -42,7 +42,14 @@ To install an [official release of CVAT SDK](https://pypi.org/project/cvat-sdk/)
 pip install cvat-sdk
 ```
 
-To use the PyTorch adapter, request the `pytorch` extra:
+To use the `cvat_sdk.masks` module, request the `masks` extra:
+
+```bash
+pip install "cvat-sdk[masks]"
+```
+
+To use the PyTorch adapter or the built-in PyTorch-based auto-annotation functions,
+request the `pytorch` extra:
 
 ```bash
 pip install "cvat-sdk[pytorch]"
diff --git a/site/content/en/docs/api_sdk/sdk/auto-annotation.md b/site/content/en/docs/api_sdk/sdk/auto-annotation.md
index f97759efd175..d8401955da7f 100644
--- a/site/content/en/docs/api_sdk/sdk/auto-annotation.md
+++ b/site/content/en/docs/api_sdk/sdk/auto-annotation.md
@@ -181,10 +181,23 @@ The following helpers are available for use in `detect`:
 | Name        | Model type               | Fixed attributes              |
 |-------------|--------------------------|-------------------------------|
 | `shape`     | `LabeledShapeRequest`    | `frame=0`                     |
+| `mask`      | `LabeledShapeRequest`    | `frame=0`, `type="mask"`      |
+| `polygon`   | `LabeledShapeRequest`    | `frame=0`, `type="polygon"`   |
 | `rectangle` | `LabeledShapeRequest`    | `frame=0`, `type="rectangle"` |
 | `skeleton`  | `LabeledShapeRequest`    | `frame=0`, `type="skeleton"`  |
 | `keypoint`  | `SubLabeledShapeRequest` | `frame=0`, `type="points"`    |
 
+For `mask`, it is recommended to create the points list using
+the `cvat_sdk.masks.encode_mask` function, which will convert a bitmap into a
+list in the format that CVAT expects. For example:
+
+```python
+cvataa.mask(my_label, encode_mask(
+    my_mask,  # boolean 2D array, same size as the input image
+    [x1, y1, x2, y2],  # top left and bottom right coordinates of the mask
+))
+```
+
 ## Auto-annotation driver
 
 The `annotate_task` function uses an AA function to annotate a CVAT task.
@@ -257,10 +270,18 @@ The `create` function accepts the following parameters:
 
 It also accepts arbitrary additional parameters, which are passed directly to the model constructor.
 
+### `cvat_sdk.auto_annotation.functions.torchvision_instance_segmentation`
+
+This AA function is analogous to `torchvision_detection`,
+except it uses torchvision's instance segmentation models and produces mask
+or polygon annotations (depending on the value of `conv_mask_to_poly`).
+
+Refer to that function's description for usage instructions and parameter information.
+
 ### `cvat_sdk.auto_annotation.functions.torchvision_keypoint_detection`
 
 This AA function is analogous to `torchvision_detection`,
 except it uses torchvision's keypoint detection models and produces skeleton annotations.
 Keypoints which the model marks as invisible will be marked as occluded in CVAT.
 
-Refer to the previous section for usage instructions and parameter information.
+Refer to that function's description for usage instructions and parameter information.
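Putting the documented pieces together, annotating a task with the new built-in function might look like the following sketch; the host, credentials, task ID, and threshold are placeholders:

```python
import cvat_sdk.auto_annotation as cvataa
import cvat_sdk.auto_annotation.functions.torchvision_instance_segmentation as tis
from cvat_sdk import make_client

with make_client(host="app.cvat.ai", credentials=("user", "password")) as client:
    cvataa.annotate_task(
        client,
        12345,  # placeholder task ID
        tis.create("maskrcnn_resnet50_fpn_v2"),
        conf_threshold=0.75,
        conv_mask_to_poly=True,  # emit polygons instead of masks
    )
```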
diff --git a/tests/python/cli/cmtp_function.py b/tests/python/cli/cmtp_function.py
new file mode 100644
index 000000000000..2ae5cb26f663
--- /dev/null
+++ b/tests/python/cli/cmtp_function.py
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import cvat_sdk.auto_annotation as cvataa
+import cvat_sdk.models as models
+import PIL.Image
+
+spec = cvataa.DetectionFunctionSpec(
+    labels=[
+        cvataa.label_spec("car", 0),
+    ],
+)
+
+
+def detect(
+    context: cvataa.DetectionFunctionContext, image: PIL.Image.Image
+) -> list[models.LabeledShapeRequest]:
+    if context.conv_mask_to_poly:
+        return [cvataa.polygon(0, [0, 0, 0, 1, 1, 1])]
+    else:
+        return [cvataa.mask(0, [1, 0, 0, 0, 0])]
diff --git a/tests/python/cli/test_cli.py b/tests/python/cli/test_cli.py
index a039fd3744bc..f57775ca67ab 100644
--- a/tests/python/cli/test_cli.py
+++ b/tests/python/cli/test_cli.py
@@ -361,3 +361,25 @@ def test_auto_annotate_with_threshold(self, fxt_new_task: Task):
         annotations = fxt_new_task.get_annotations()
 
         assert annotations.shapes[0].points[0] == 0.75
+
+    def test_auto_annotate_with_cmtp(self, fxt_new_task: Task):
+        self.run_cli(
+            "auto-annotate",
+            str(fxt_new_task.id),
+            f"--function-module={__package__}.cmtp_function",
+            "--clear-existing",
+        )
+
+        annotations = fxt_new_task.get_annotations()
+        assert annotations.shapes[0].type.value == "mask"
+
+        self.run_cli(
+            "auto-annotate",
+            str(fxt_new_task.id),
+            f"--function-module={__package__}.cmtp_function",
+            "--clear-existing",
+            "--conv-mask-to-poly",
+        )
+
+        annotations = fxt_new_task.get_annotations()
+        assert annotations.shapes[0].type.value == "polygon"
diff --git a/tests/python/requirements.txt b/tests/python/requirements.txt
index 6ef44c0f5edb..5dfad3d6f7fb 100644
--- a/tests/python/requirements.txt
+++ b/tests/python/requirements.txt
@@ -4,9 +4,9 @@ pytest-cases==3.6.13
 pytest-timeout==2.1.0
 pytest-cov==4.1.0
 requests==2.32.2
-deepdiff==5.6.0
+deepdiff==7.0.1
 boto3==1.17.61
 Pillow==10.3.0
 python-dateutil==2.8.2
 pyyaml==6.0.0
-numpy==1.22.0
\ No newline at end of file
+numpy==2.0.0
diff --git a/tests/python/sdk/test_auto_annotation.py b/tests/python/sdk/test_auto_annotation.py
index 6fa96a5843f4..ff7302c1d9c5 100644
--- a/tests/python/sdk/test_auto_annotation.py
+++ b/tests/python/sdk/test_auto_annotation.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: MIT
 
 import io
+import math
 from logging import Logger
 from pathlib import Path
 from types import SimpleNamespace as namespace
@@ -307,6 +308,39 @@ def detect(
             conf_threshold=bad_threshold,
         )
 
+    def test_conv_mask_to_poly(self):
+        spec = cvataa.DetectionFunctionSpec(
+            labels=[
+                cvataa.label_spec("car", 123),
+            ],
+        )
+
+        received_cmtp = None
+
+        def detect(context, image: PIL.Image.Image) -> list[models.LabeledShapeRequest]:
+            nonlocal received_cmtp
+            received_cmtp = context.conv_mask_to_poly
+            return [cvataa.mask(123, [1, 0, 0, 0, 0])]
+
+        cvataa.annotate_task(
+            self.client,
+            self.task.id,
+            namespace(spec=spec, detect=detect),
+            conv_mask_to_poly=False,
+        )
+
+        assert received_cmtp is False
+
+        with pytest.raises(cvataa.BadFunctionError, match=".*conv_mask_to_poly.*"):
+            cvataa.annotate_task(
+                self.client,
+                self.task.id,
+                namespace(spec=spec, detect=detect),
+                conv_mask_to_poly=True,
+            )
+
+        assert received_cmtp is True
+
     def _test_bad_function_spec(self, spec: cvataa.DetectionFunctionSpec, exc_match: str) -> None:
         def detect(context, image):
             assert False
@@ -626,6 +660,60 @@ def fake_get_detection_model(name: str, weights, test_param):
 
     return FakeTorchvisionDetector(label_id=car_label_id)
 
+class FakeTorchvisionInstanceSegmenter(nn.Module):
+    def __init__(self, label_id: int) -> None:
+        super().__init__()
+        self._label_id = label_id
+
+    def forward(self, images: list[torch.Tensor]) -> list[dict]:
+        assert isinstance(images, list)
+        assert all(isinstance(t, torch.Tensor) for t in images)
+
+        def make_box(im, a1, a2):
+            return [im.shape[2] * a1, im.shape[1] * a1, im.shape[2] * a2, im.shape[1] * a2]
+
+        def make_mask(im, a1, a2):
+            # creates a rectangular mask with a hole
+            mask = torch.full((1, im.shape[1], im.shape[2]), 0.49)
+            mask[
+                0,
+                math.ceil(im.shape[1] * a1) : math.floor(im.shape[1] * a2),
+                math.ceil(im.shape[2] * a1) : math.floor(im.shape[2] * a2),
+            ] = 0.5
+            mask[
+                0,
+                math.ceil(im.shape[1] * a1) + 3 : math.floor(im.shape[1] * a2) - 3,
+                math.ceil(im.shape[2] * a1) + 3 : math.floor(im.shape[2] * a2) - 3,
+            ] = 0.49
+            return mask
+
+        return [
+            {
+                "labels": torch.tensor([self._label_id, self._label_id]),
+                "boxes": torch.tensor(
+                    [
+                        make_box(im, 1 / 6, 1 / 3),
+                        make_box(im, 2 / 3, 5 / 6),
+                    ]
+                ),
+                "masks": torch.stack(
+                    [
+                        make_mask(im, 1 / 6, 1 / 3),
+                        make_mask(im, 2 / 3, 5 / 6),
+                    ]
+                ),
+                "scores": torch.tensor([0.75, 0.74]),
+            }
+            for im in images
+        ]
+
+def fake_get_instance_segmentation_model(name: str, weights, test_param):
+    assert test_param == "expected_value"
+
+    car_label_id = weights.meta["categories"].index("car")
+
+    return FakeTorchvisionInstanceSegmenter(label_id=car_label_id)
+
 class FakeTorchvisionKeypointDetector(nn.Module):
     def __init__(self, label_id: int, keypoint_names: list[str]) -> None:
         super().__init__()
@@ -723,6 +811,54 @@ def test_torchvision_detection(self, monkeypatch: pytest.MonkeyPatch):
         assert annotations.shapes[0].type.value == "rectangle"
         assert annotations.shapes[0].points == [1, 2, 3, 4]
 
+    def test_torchvision_instance_segmentation(self, monkeypatch: pytest.MonkeyPatch):
+        monkeypatch.setattr(torchvision_models, "get_model", fake_get_instance_segmentation_model)
+
+        import cvat_sdk.auto_annotation.functions.torchvision_instance_segmentation as tis
+        from cvat_sdk.masks import encode_mask
+
+        cvataa.annotate_task(
+            self.client,
+            self.task.id,
+            tis.create("maskrcnn_resnet50_fpn_v2", "COCO_V1", test_param="expected_value"),
+            allow_unmatched_labels=True,
+            conf_threshold=0.75,
+        )
+
+        annotations = self.task.get_annotations()
+
+        assert len(annotations.shapes) == 1
+        assert self.task_labels_by_id[annotations.shapes[0].label_id].name == "car"
+
+        expected_bitmap = torch.zeros((100, 100), dtype=torch.bool)
+        expected_bitmap[17:33, 17:33] = True
+        expected_bitmap[20:30, 20:30] = False
+
+        assert annotations.shapes[0].type.value == "mask"
+        assert annotations.shapes[0].points == encode_mask(expected_bitmap, [16, 16, 34, 34])
+
+        cvataa.annotate_task(
+            self.client,
+            self.task.id,
+            tis.create("maskrcnn_resnet50_fpn_v2", "COCO_V1", test_param="expected_value"),
+            allow_unmatched_labels=True,
+            conf_threshold=0.75,
+            conv_mask_to_poly=True,
+            clear_existing=True,
+        )
+
+        annotations = self.task.get_annotations()
+
+        assert len(annotations.shapes) == 1
+        assert self.task_labels_by_id[annotations.shapes[0].label_id].name == "car"
+        assert annotations.shapes[0].type.value == "polygon"
+
+        # We shouldn't rely on the exact result of polygon conversion,
+        # since it depends on a 3rd-party library. Instead, we'll just
+        # check that all points are within the expected area.
+        for x, y in zip(*[iter(annotations.shapes[0].points)] * 2):
+            assert expected_bitmap[round(y), round(x)]
+
     def test_torchvision_keypoint_detection(self, monkeypatch: pytest.MonkeyPatch):
         monkeypatch.setattr(torchvision_models, "get_model", fake_get_keypoint_detection_model)
 
diff --git a/tests/python/sdk/test_masks.py b/tests/python/sdk/test_masks.py
new file mode 100644
index 000000000000..46e8b9f214cc
--- /dev/null
+++ b/tests/python/sdk/test_masks.py
@@ -0,0 +1,71 @@
+# Copyright (C) 2024 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import pytest
+
+try:
+    import numpy as np
+    from cvat_sdk.masks import encode_mask
+
+except ModuleNotFoundError as e:
+    if e.name.split(".")[0] != "numpy":
+        raise
+
+    encode_mask = None
+
+
+@pytest.mark.skipif(encode_mask is None, reason="NumPy is not installed")
+class TestMasks:
+    def test_encode_mask(self):
+        bitmap = np.array(
+            [
+                np.fromstring("0 0 1 1 1 0", sep=" "),
+                np.fromstring("0 1 1 0 0 0", sep=" "),
+            ],
+            dtype=np.bool_,
+        )
+        bbox = [2.9, 0.9, 4.1, 1.1]  # will get rounded to [2, 0, 5, 2]
+
+        # There's slightly different logic for when the cropped mask starts with
+        # 0 and 1, so test both.
+        # This one starts with 1:
+        # 111
+        # 100
+
+        assert encode_mask(bitmap, bbox) == [0, 4, 2, 2, 0, 4, 1]
+
+        bbox = [1, 0, 5, 2]
+
+        # This one starts with 0:
+        # 0111
+        # 1100
+
+        assert encode_mask(bitmap, bbox) == [1, 5, 2, 1, 0, 4, 1]
+
+        # Edge case: full image
+        bbox = [0, 0, 6, 2]
+        assert encode_mask(bitmap, bbox) == [2, 3, 2, 2, 3, 0, 0, 5, 1]
+
+    def test_encode_mask_invalid_dim(self):
+        with pytest.raises(ValueError, match="bitmap must have 2 dimensions"):
+            encode_mask([True], [0, 0, 1, 1])
+
+    def test_encode_mask_invalid_dtype(self):
+        with pytest.raises(ValueError, match="bitmap must have boolean items"):
+            encode_mask([[1]], [0, 0, 1, 1])
+
+    @pytest.mark.parametrize(
+        "bbox",
+        [
+            [-0.1, 0, 1, 1],
+            [0, -0.1, 1, 1],
+            [0, 0, 1.1, 1],
+            [0, 0, 1, 1.1],
+            [1, 0, 0, 1],
+            [0, 1, 1, 0],
+        ],
+    )
+    def test_encode_mask_invalid_bbox(self, bbox):
+        with pytest.raises(ValueError, match="bbox has invalid coordinates"):
+            encode_mask([[True]], bbox)
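For readers who want to sanity-check the RLE format these tests exercise, a rough inverse of `encode_mask` could look like the sketch below. It is not part of the SDK, only an illustration of the format under the assumptions stated in the docstring.

```python
import numpy as np


def decode_mask(points: list[float], image_size: tuple[int, int]) -> np.ndarray:
    """Rough inverse of encode_mask, for illustration only.

    points is [run lengths..., x1, y1, x2, y2]: the runs alternate between
    0- and 1-valued pixels (starting with 0s) over the row-major flattening
    of the bbox region, and the bbox coordinates are inclusive.
    """
    *runs, x1, y1, x2, y2 = map(int, points)
    height, width = y2 - y1 + 1, x2 - x1 + 1

    flat = np.zeros(height * width, dtype=np.bool_)
    pos, value = 0, False
    for run in runs:
        flat[pos : pos + run] = value
        pos += run
        value = not value

    bitmap = np.zeros(image_size, dtype=np.bool_)
    bitmap[y1 : y2 + 1, x1 : x2 + 1] = flat.reshape(height, width)
    return bitmap


# For the first test case above, decode_mask([0, 4, 2, 2, 0, 4, 1], (2, 6))
# recovers the test bitmap's pixels inside the bbox; pixels outside the box
# are lost by design, since encode_mask crops to the bbox.
```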