diff --git a/app/packages/core/src/plugins/SchemaIO/components/NativeModelEvaluationView/Evaluation.tsx b/app/packages/core/src/plugins/SchemaIO/components/NativeModelEvaluationView/Evaluation.tsx
index f9c6ade6d4b..75dfa5e3b43 100644
--- a/app/packages/core/src/plugins/SchemaIO/components/NativeModelEvaluationView/Evaluation.tsx
+++ b/app/packages/core/src/plugins/SchemaIO/components/NativeModelEvaluationView/Evaluation.tsx
@@ -1783,11 +1783,8 @@ type SummaryRow = {
 
 function formatCustomMetricRows(evaluationMetrics, comparisonMetrics) {
   const results = [] as SummaryRow[];
-  const customMetrics = _.get(
-    evaluationMetrics,
-    "custom_metrics",
-    {}
-  ) as CustomMetrics;
+  const customMetrics = (_.get(evaluationMetrics, "custom_metrics", null) ||
+    {}) as CustomMetrics;
   for (const [operatorUri, customMetric] of Object.entries(customMetrics)) {
     const compareValue = _.get(
       comparisonMetrics,
diff --git a/fiftyone/core/fields.py b/fiftyone/core/fields.py
index 4a9e7415730..eb401ea9f64 100644
--- a/fiftyone/core/fields.py
+++ b/fiftyone/core/fields.py
@@ -1621,6 +1621,68 @@ def is_rgb_target(target):
     )
 
 
+def hex_to_int(hex_str):
+    """Converts a hex string like `"#ff6d04"` to its RRGGBB integer value.
+
+    Args:
+        hex_str: a hex string whose characters 1-6 are the RRGGBB digits
+
+    Returns:
+        the integer ``(r << 16) + (g << 8) + b``
+    """
+    r = int(hex_str[1:3], 16)
+    g = int(hex_str[3:5], 16)
+    b = int(hex_str[5:7], 16)
+    return (r << 16) + (g << 8) + b
+
+
+def int_to_hex(value):
+    """Converts an RRGGBB integer value to a hex string like `"#ff6d04"`.
+
+    Args:
+        value: an RRGGBB integer value; only the low 24 bits are used
+
+    Returns:
+        a lowercase ``"#rrggbb"`` hex string
+    """
+    r = (value >> 16) & 255
+    g = (value >> 8) & 255
+    b = value & 255
+    return "#%02x%02x%02x" % (r, g, b)
+
+
+def rgb_array_to_int(mask):
+    """Converts an RGB mask array to a 2D RRGGBB integer mask array.
+
+    Args:
+        mask: a 3D mask array whose first three channels are R, G, and B
+
+    Returns:
+        a 2D integer mask array
+    """
+    return (
+        np.left_shift(mask[:, :, 0], 16, dtype=int)
+        + np.left_shift(mask[:, :, 1], 8, dtype=int)
+        + mask[:, :, 2]
+    )
+
+
+def int_array_to_rgb(mask):
+    """Converts a 2D RRGGBB integer mask array to an RGB mask array.
+
+    Args:
+        mask: a 2D integer mask array
+
+    Returns:
+        a ``uint8`` RGB mask array with channels on the last axis
+    """
+    # Cast via `astype()` rather than `np.stack(..., dtype=np.uint8)`, which
+    # requires NumPy >= 1.24 and raises for signed integer masks because
+    # int -> uint8 is not permitted by its default "same_kind" casting
+    channels = [(mask >> 16) & 255, (mask >> 8) & 255, mask & 255]
+    return np.stack(channels, axis=2).astype(np.uint8)
+
+
 class EmbeddedDocumentField(mongoengine.fields.EmbeddedDocumentField, Field):
     """A field that stores instances of a given type of
     :class:`fiftyone.core.odm.BaseEmbeddedDocument` object.
diff --git a/fiftyone/server/utils.py b/fiftyone/server/utils.py
index 6104f35c73a..31448b45d8d 100644
--- a/fiftyone/server/utils.py
+++ b/fiftyone/server/utils.py
@@ -49,6 +49,21 @@ def load_and_cache_dataset(name):
     return dataset
 
 
+def cache_dataset(dataset):
+    """Caches the given dataset under its name.
+
+    This function ensures that subsequent calls to
+    :func:`fiftyone.core.dataset.load_dataset` in async calls will return this
+    dataset singleton.
+
+    See :func:`load_and_cache_dataset` for additional details.
+
+    Args:
+        dataset: a :class:`fiftyone.core.dataset.Dataset`
+    """
+    _cache[dataset.name] = dataset
+
+
 def change_sample_tags(sample_collection, changes):
     """Applies the changes to tags to all samples of the collection, if
     necessary.
diff --git a/fiftyone/utils/eval/segmentation.py b/fiftyone/utils/eval/segmentation.py
index 2a81dadc999..7bebee10f84 100644
--- a/fiftyone/utils/eval/segmentation.py
+++ b/fiftyone/utils/eval/segmentation.py
@@ -8,6 +8,7 @@
 from copy import deepcopy
 import logging
 import inspect
+import itertools
 import warnings
 
 import numpy as np
@@ -369,7 +370,7 @@ def evaluate_samples(
         if mask_targets is not None:
             if fof.is_rgb_mask_targets(mask_targets):
                 mask_targets = {
-                    _hex_to_int(k): v for k, v in mask_targets.items()
+                    fof.hex_to_int(k): v for k, v in mask_targets.items()
                 }
 
             values, classes = zip(*sorted(mask_targets.items()))
@@ -385,6 +386,7 @@ def evaluate_samples(
 
         nc = len(values)
         confusion_matrix = np.zeros((nc, nc), dtype=int)
+        matches = []
 
         bandwidth = self.config.bandwidth
         average = self.config.average
@@ -427,6 +429,17 @@ def evaluate_samples(
                 )
                 sample_conf_mat += image_conf_mat
 
+                for i, j in zip(*np.nonzero(image_conf_mat)):
+                    matches.append(
+                        (
+                            classes[i],
+                            classes[j],
+                            int(image_conf_mat[i, j]),
+                            gt_seg.id,
+                            pred_seg.id,
+                        )
+                    )
+
                 if processing_frames and save:
                     facc, fpre, frec = _compute_accuracy_precision_recall(
                         image_conf_mat, values, average
@@ -460,6 +473,7 @@ def evaluate_samples(
             eval_key,
             confusion_matrix,
             classes,
+            matches=matches,
             missing=missing,
             backend=self,
         )
@@ -474,6 +488,9 @@ class SegmentationResults(BaseClassificationResults):
         eval_key: the evaluation key
         pixel_confusion_matrix: a pixel value confusion matrix
         classes: a list of class labels corresponding to the confusion matrix
+        matches (None): a list of
+            ``(gt_label, pred_label, pixel_count, gt_id, pred_id)``
+            matches
         missing (None): a missing (background) class
         custom_metrics (None): an optional dict of custom metrics
         backend (None): a :class:`SegmentationEvaluation` backend
@@ -486,14 +503,23 @@ def __init__(
         eval_key,
         pixel_confusion_matrix,
         classes,
+        matches=None,
         missing=None,
         custom_metrics=None,
         backend=None,
     ):
         pixel_confusion_matrix = np.asarray(pixel_confusion_matrix)
-        ytrue, ypred, weights = self._parse_confusion_matrix(
-            pixel_confusion_matrix, classes
-        )
+
+        if matches is None:
+            ytrue, ypred, weights = self._parse_confusion_matrix(
+                pixel_confusion_matrix, classes
+            )
+            ytrue_ids = None
+            ypred_ids = None
+        elif matches:
+            ytrue, ypred, weights, ytrue_ids, ypred_ids = zip(*matches)
+        else:
+            ytrue, ypred, weights, ytrue_ids, ypred_ids = [], [], [], [], []
 
         super().__init__(
             samples,
@@ -502,6 +528,8 @@ def __init__(
             ytrue,
             ypred,
             weights=weights,
+            ytrue_ids=ytrue_ids,
+            ypred_ids=ypred_ids,
             classes=classes,
             missing=missing,
             custom_metrics=custom_metrics,
@@ -510,15 +538,6 @@ def __init__(
 
         self.pixel_confusion_matrix = pixel_confusion_matrix
 
-    def attributes(self):
-        return [
-            "cls",
-            "pixel_confusion_matrix",
-            "classes",
-            "missing",
-            "custom_metrics",
-        ]
-
     def dice_score(self):
         """Computes the Dice score across all samples in the evaluation.
 
@@ -529,12 +548,31 @@ def dice_score(self):
 
     @classmethod
     def _from_dict(cls, d, samples, config, eval_key, **kwargs):
+        ytrue = d.get("ytrue", None)
+        ypred = d.get("ypred", None)
+        weights = d.get("weights", None)
+        ytrue_ids = d.get("ytrue_ids", None)
+        ypred_ids = d.get("ypred_ids", None)
+
+        if ytrue is not None and ypred is not None and weights is not None:
+            if ytrue_ids is None:
+                ytrue_ids = itertools.repeat(None)
+
+            if ypred_ids is None:
+                ypred_ids = itertools.repeat(None)
+
+            matches = list(zip(ytrue, ypred, weights, ytrue_ids, ypred_ids))
+        else:
+            # Legacy format segmentations
+            matches = None
+
         return cls(
             samples,
             config,
             eval_key,
             d["pixel_confusion_matrix"],
             d["classes"],
+            matches=matches,
             missing=d.get("missing", None),
             custom_metrics=d.get("custom_metrics", None),
             **kwargs,
@@ -599,10 +637,10 @@ def _compute_pixel_confusion_matrix(
     pred_mask, gt_mask, values, bandwidth=None
 ):
     if pred_mask.ndim == 3:
-        pred_mask = _rgb_array_to_int(pred_mask)
+        pred_mask = fof.rgb_array_to_int(pred_mask)
 
     if gt_mask.ndim == 3:
-        gt_mask = _rgb_array_to_int(gt_mask)
+        gt_mask = fof.rgb_array_to_int(gt_mask)
 
     if pred_mask.shape != gt_mask.shape:
         msg = (
@@ -675,37 +713,15 @@ def _get_mask_values(samples, pred_field, gt_field, progress=None):
                     mask = seg.get_mask()
                     if mask.ndim == 3:
                         is_rgb = True
-                        mask = _rgb_array_to_int(mask)
+                        mask = fof.rgb_array_to_int(mask)
 
                     values.update(mask.ravel())
 
     values = sorted(values)
 
     if is_rgb:
-        classes = [_int_to_hex(v) for v in values]
+        classes = [fof.int_to_hex(v) for v in values]
     else:
         classes = [str(v) for v in values]
 
     return values, classes
-
-
-def _rgb_array_to_int(mask):
-    return (
-        np.left_shift(mask[:, :, 0], 16, dtype=int)
-        + np.left_shift(mask[:, :, 1], 8, dtype=int)
-        + mask[:, :, 2]
-    )
-
-
-def _hex_to_int(hex_str):
-    r = int(hex_str[1:3], 16)
-    g = int(hex_str[3:5], 16)
-    b = int(hex_str[5:7], 16)
-    return (r << 16) + (g << 8) + b
-
-
-def _int_to_hex(value):
-    r = (value >> 16) & 255
-    g = (value >> 8) & 255
-    b = value & 255
-    return "#%02x%02x%02x" % (r, g, b)
diff --git a/plugins/panels/model_evaluation/__init__.py b/plugins/panels/model_evaluation/__init__.py
index 35e850d6415..31849ff94e0 100644
--- a/plugins/panels/model_evaluation/__init__.py
+++ b/plugins/panels/model_evaluation/__init__.py
@@ -10,9 +10,11 @@
 import os
 import traceback
 
+from bson import ObjectId
 import numpy as np
 
 from fiftyone import ViewField as F
+import fiftyone.core.fields as fof
 from fiftyone.operators.categories import Categories
 from fiftyone.operators.panel import Panel, PanelConfig
 from fiftyone.core.plots.plotly import _to_log_colorscale
@@ -321,16 +323,6 @@ def get_confusion_matrices(self, results):
             "lc_colorscale": lc_colorscale,
         }
 
-    def get_mask_targets(self, dataset, gt_field):
-        mask_targets = dataset.mask_targets.get(gt_field, None)
-        if mask_targets:
-            return mask_targets
-
-        if dataset.default_mask_targets:
-            return dataset.default_mask_targets
-
-        return None
-
     def load_evaluation(self, ctx):
         view_state = ctx.panel.get_state("view") or {}
         eval_key = view_state.get("key")
@@ -351,13 +343,15 @@ def load_evaluation(self, ctx):
                     {"error": "unsupported", "info": serialized_info},
                 )
                 return
-            gt_field = info.config.gt_field
-            mask_targets = (
-                self.get_mask_targets(ctx.dataset, gt_field)
-                if evaluation_type == "segmentation"
-                else None
-            )
+
             results = ctx.dataset.load_evaluation_results(computed_eval_key)
+            gt_field = info.config.gt_field
+            mask_targets = None
+
+            if evaluation_type == "segmentation":
+                mask_targets = _get_mask_targets(ctx.dataset, gt_field)
+                _init_segmentation_results(ctx.dataset, results, gt_field)
+
             metrics = results.metrics()
             per_class_metrics = self.get_per_class_metrics(info, results)
             metrics["average_confidence"] = self.get_avg_confidence(
@@ -592,6 +586,78 @@ def load_view(self, ctx):
                     view = eval_view.filter_labels(
                         pred_field, F(eval_key) == field, only_matches=True
                     )
+        elif info.config.type == "segmentation":
+            results = ctx.dataset.load_evaluation_results(eval_key)
+            _init_segmentation_results(ctx.dataset, results, gt_field)
+            if results.ytrue_ids is None or results.ypred_ids is None:
+                # Legacy format segmentations
+                return
+
+            if eval_key2:
+                if gt_field2 is None:
+                    gt_field2 = gt_field
+
+                results2 = ctx.dataset.load_evaluation_results(eval_key2)
+                _init_segmentation_results(ctx.dataset, results2, gt_field2)
+                if results2.ytrue_ids is None or results2.ypred_ids is None:
+                    # Legacy format segmentations
+                    return
+            else:
+                results2 = None
+
+            _, gt_id = ctx.dataset._get_label_field_path(gt_field, "_id")
+            _, pred_id = ctx.dataset._get_label_field_path(pred_field, "_id")
+            if gt_field2 is not None:
+                _, gt_id2 = ctx.dataset._get_label_field_path(gt_field2, "_id")
+            if pred_field2 is not None:
+                _, pred_id2 = ctx.dataset._get_label_field_path(
+                    pred_field2, "_id"
+                )
+
+            if view_type == "class":
+                # All GT/predictions that contain class `x`
+                ytrue_ids, ypred_ids = _get_segmentation_class_ids(results, x)
+                expr = F(gt_id).is_in(ytrue_ids)
+                expr |= F(pred_id).is_in(ypred_ids)
+                if results2 is not None:
+                    ytrue_ids2, ypred_ids2 = _get_segmentation_class_ids(
+                        results2, x
+                    )
+                    expr |= F(gt_id2).is_in(ytrue_ids2)
+                    expr |= F(pred_id2).is_in(ypred_ids2)
+
+                view = eval_view.match(expr)
+            elif view_type == "matrix":
+                # Specific confusion matrix cell
+                ytrue_ids, ypred_ids = _get_segmentation_conf_mat_ids(
+                    results, x, y
+                )
+                expr = F(gt_id).is_in(ytrue_ids)
+                expr &= F(pred_id).is_in(ypred_ids)
+                view = eval_view.match(expr)
+            elif view_type == "field":
+                if field == "tp":
+                    # All true positives
+                    ytrue_ids, ypred_ids = _get_segmentation_tp_fp_fn_ids(
+                        results, field
+                    )
+                    expr = F(gt_id).is_in(ytrue_ids)
+                    expr &= F(pred_id).is_in(ypred_ids)
+                    view = eval_view.match(expr)
+                elif field == "fn":
+                    # All false negatives
+                    ytrue_ids, _ = _get_segmentation_tp_fp_fn_ids(
+                        results, field
+                    )
+                    expr = F(gt_id).is_in(ytrue_ids)
+                    view = eval_view.match(expr)
+                else:
+                    # All false positives
+                    _, ypred_ids = _get_segmentation_tp_fp_fn_ids(
+                        results, field
+                    )
+                    expr = F(pred_id).is_in(ypred_ids)
+                    view = eval_view.match(expr)
 
         if view is not None:
             ctx.ops.set_view(view)
@@ -612,3 +678,127 @@ def render(self, ctx):
                 load_view=self.load_view,
             ),
         )
+
+
+def _get_mask_targets(dataset, gt_field):
+    """Returns the mask targets for the given field, falling back to the
+    dataset's default mask targets, or None if neither is non-empty.
+    """
+    field_targets = dataset.mask_targets.get(gt_field, None)
+    if field_targets:
+        return field_targets
+
+    return dataset.default_mask_targets or None
+
+
+def _init_segmentation_results(dataset, results, gt_field):
+    if results.ytrue_ids is None or results.ypred_ids is None:
+        # Legacy format results without label IDs; nothing to index
+        return
+
+    if getattr(results, "_classes_map", None):
+        # Already initialized; the index is stored on `results` below
+        return
+
+    #
+    # Ensure the dataset singleton is cached so that subsequent callbacks on
+    # this panel will use the same `dataset` and hence `results`
+    #
+
+    import fiftyone.server.utils as fosu
+
+    fosu.cache_dataset(dataset)
+
+    #
+    # `results.classes` and App callbacks could contain any of the
+    # following:
+    #  1. stringified pixel values
+    #  2. RGB hex strings
+    #  3. label strings
+    #
+    # so we must construct a `classes_map` that maps any of these possible
+    # values to integer indexes into `results.classes`
+    #
+    classes_map = {c: i for i, c in enumerate(results.classes)}
+
+    mask_targets = _get_mask_targets(dataset, gt_field)
+    if mask_targets is not None:
+        # `str()` handles cases 1 and 2, and `.get(c, c)` handles case 3
+        mask_targets = {str(k): v for k, v in mask_targets.items()}
+        classes = [mask_targets.get(c, c) for c in results.classes]
+        classes_map.update({c: i for i, c in enumerate(classes)})
+
+    #
+    # Generate mappings from `(i, j)` = (GT index, predicted index) to the
+    # corresponding GT/predicted label ID lists for use in App callbacks
+
+    ytrue_ids_dict = {}
+    ypred_ids_dict = {}
+    for ytrue, ypred, ytrue_id, ypred_id in zip(
+        results.ytrue, results.ypred, results.ytrue_ids, results.ypred_ids
+    ):
+        i = classes_map[ytrue]
+        j = classes_map[ypred]
+        index = (i, j)
+
+        if index not in ytrue_ids_dict:
+            ytrue_ids_dict[index] = []
+        ytrue_ids_dict[index].append(ytrue_id)
+
+        if index not in ypred_ids_dict:
+            ypred_ids_dict[index] = []
+        ypred_ids_dict[index].append(ypred_id)
+
+    results._classes_map = classes_map
+    results._ytrue_ids_dict = ytrue_ids_dict
+    results._ypred_ids_dict = ypred_ids_dict
+
+
+def _get_segmentation_class_ids(results, x):
+    """Returns the GT label IDs of matches whose GT class is `x` and the
+    predicted label IDs of matches whose predicted class is `x`."""
+    k = results._classes_map[x]
+    nrows, ncols = results.pixel_confusion_matrix.shape
+
+    # Row `k` of the confusion matrix: ground truth is class `x`
+    ytrue_ids = []
+    for col in range(ncols):
+        ytrue_ids.extend(results._ytrue_ids_dict.get((k, col), []))
+
+    # Column `k` of the confusion matrix: prediction is class `x`
+    ypred_ids = []
+    for row in range(nrows):
+        ypred_ids.extend(results._ypred_ids_dict.get((row, k), []))
+
+    return _to_object_ids(ytrue_ids), _to_object_ids(ypred_ids)
+
+
+def _get_segmentation_conf_mat_ids(results, x, y):
+    # Confusion matrix cell: row is GT class `x`, column is predicted `y`
+    cell = (results._classes_map[x], results._classes_map[y])
+    gt_ids = _to_object_ids(results._ytrue_ids_dict.get(cell, []))
+    pred_ids = _to_object_ids(results._ypred_ids_dict.get(cell, []))
+    return gt_ids, pred_ids
+
+
+def _get_segmentation_tp_fp_fn_ids(results, field):
+    if field == "tp":
+        # True positives: GT and predicted classes match
+        inds = results.ytrue == results.ypred
+        ytrue_ids = _to_object_ids(results.ytrue_ids[inds])
+        ypred_ids = _to_object_ids(results.ypred_ids[inds])
+        return ytrue_ids, ypred_ids
+    elif field == "fn":
+        # False negatives: predicted class is the missing label
+        inds = results.ypred == results.missing
+        ytrue_ids = _to_object_ids(results.ytrue_ids[inds])
+        return ytrue_ids, None
+    else:
+        # False positives: GT class is the missing label
+        inds = results.ytrue == results.missing
+        ypred_ids = _to_object_ids(results.ypred_ids[inds])
+        return None, ypred_ids
+
+
+def _to_object_ids(ids):
+    return list(map(ObjectId, ids))