From 18281148ea79210dca737e7e18cf56cf7076416b Mon Sep 17 00:00:00 2001
From: prernadh
Date: Fri, 27 Dec 2024 12:04:24 -0700
Subject: [PATCH 1/6] Adding Seg changes

---
 fiftyone/utils/eval/segmentation.py | 52 +++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 10 deletions(-)

diff --git a/fiftyone/utils/eval/segmentation.py b/fiftyone/utils/eval/segmentation.py
index 2a81dadc999..95f67decf0c 100644
--- a/fiftyone/utils/eval/segmentation.py
+++ b/fiftyone/utils/eval/segmentation.py
@@ -385,6 +385,8 @@ def evaluate_samples(
         nc = len(values)
         confusion_matrix = np.zeros((nc, nc), dtype=int)
+        ypred_ids = {}
+        ytrue_ids = {}

         bandwidth = self.config.bandwidth
         average = self.config.average
@@ -426,6 +428,16 @@ def evaluate_samples(
                     bandwidth=bandwidth,
                 )
                 sample_conf_mat += image_conf_mat
+                non_zero_indexes = np.nonzero(sample_conf_mat)
+                for index in zip(*non_zero_indexes):
+                    if index not in ypred_ids:
+                        ypred_ids[index] = [pred_seg.id]
+                    else:
+                        ypred_ids[index].append(pred_seg.id)
+                    if index not in ytrue_ids:
+                        ytrue_ids[index] = [gt_seg.id]
+                    else:
+                        ytrue_ids[index].append(gt_seg.id)

                 if processing_frames and save:
                     facc, fpre, frec = _compute_accuracy_precision_recall(
@@ -454,15 +466,18 @@ def evaluate_samples(
         else:
             missing = None

-        return SegmentationResults(
+        res = SegmentationResults(
             samples,
             self.config,
             eval_key,
             confusion_matrix,
             classes,
+            ypred_ids=ypred_ids,
+            ytrue_ids=ytrue_ids,
             missing=missing,
             backend=self,
         )
+        return res


 class SegmentationResults(BaseClassificationResults):
@@ -486,13 +501,15 @@ def __init__(
         eval_key,
         pixel_confusion_matrix,
         classes,
+        ypred_ids=None,
+        ytrue_ids=None,
         missing=None,
         custom_metrics=None,
         backend=None,
     ):
         pixel_confusion_matrix = np.asarray(pixel_confusion_matrix)
-        ytrue, ypred, weights = self._parse_confusion_matrix(
-            pixel_confusion_matrix, classes
+        ytrue, ypred, weights, ytrue_ids, ypred_ids = self._parse_confusion_matrix(
+            pixel_confusion_matrix, classes, ypred_ids, ytrue_ids
         )

         super().__init__(
@@ -502,6 +519,8 @@ def __init__(
             ytrue,
             ypred,
             weights=weights,
+            ytrue_ids=ytrue_ids,
+            ypred_ids=ypred_ids,
             classes=classes,
             missing=missing,
             custom_metrics=custom_metrics,
@@ -541,21 +560,34 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs):
         )

     @staticmethod
-    def _parse_confusion_matrix(confusion_matrix, classes):
+    def _parse_confusion_matrix(confusion_matrix, classes, ytrue_ids_dict, ypred_ids_dict):
         ytrue = []
         ypred = []
         weights = []
+        ytrue_ids = None
+        ypred_ids = None
+        if ytrue_ids_dict is not None and ypred_ids_dict is not None:
+            ytrue_ids = []
+            ypred_ids = []
         nrows, ncols = confusion_matrix.shape
         for i in range(nrows):
             for j in range(ncols):
+                index = (i, j)
                 cij = confusion_matrix[i, j]
                 if cij > 0:
-                    ytrue.append(classes[i])
-                    ypred.append(classes[j])
-                    weights.append(cij)
-
-        return ytrue, ypred, weights
-
+                    if ytrue_ids_dict is not None and ypred_ids_dict is not None:
+                        ytrue_ids += ytrue_ids_dict[index]
+                        ypred_ids += ypred_ids_dict[index]
+                        ytrue_multiplier = len(ytrue_ids_dict[index])
+                    else:
+                        ytrue_multiplier = 1
+                    for p in range(ytrue_multiplier):
+                        ytrue.append(classes[i])
+                        ypred.append(classes[j])
+                        weights.append(cij / ytrue_multiplier)
+                        # Note: the weights aren't divided across the different ytrue_ids in proportion to each mask's true contribution, but it works out for the confusion matrix calculations
+
+        return ytrue, ypred, weights, ytrue_ids, ypred_ids


 def _parse_config(pred_field, gt_field, method, **kwargs):
     if method is None:

From c7fd15701385d38d2b3ff40b44135a4cdd700802 Mon Sep 17 00:00:00 2001
From: prernadh
Date: Fri, 27 Dec 2024 12:18:56 -0700
Subject: [PATCH 2/6] Reverting new variable

---
 fiftyone/utils/eval/segmentation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fiftyone/utils/eval/segmentation.py b/fiftyone/utils/eval/segmentation.py
index 95f67decf0c..94e6bb82d1a 100644
--- a/fiftyone/utils/eval/segmentation.py
+++ b/fiftyone/utils/eval/segmentation.py
@@ -466,7 +466,7 @@ def evaluate_samples(
         else:
             missing = None

-        res = SegmentationResults(
+        return SegmentationResults(
             samples,
             self.config,
             eval_key,
@@ -477,7 +477,6 @@ def evaluate_samples(
             missing=missing,
             backend=self,
         )
-        return res


 class SegmentationResults(BaseClassificationResults):
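Note: Patch 1 threads per-mask label IDs through `SegmentationResults` by splitting each confusion matrix cell's pixel count evenly across the masks that contributed to it. That split is not each mask's true share, as the in-code note concedes, but the cell totals are preserved. A minimal standalone sketch of the expansion (not FiftyOne code; the count and IDs are hypothetical):

```python
# One confusion matrix cell (i, j) with a hypothetical pixel count that
# three hypothetical masks contributed to
cell_count = 300
seg_ids = ["seg1", "seg2", "seg3"]

# Patch 1 emits one (ytrue, ypred, weight) row per contributing ID,
# splitting the cell's count evenly across them
rows = [(seg_id, cell_count / len(seg_ids)) for seg_id in seg_ids]

# The summed weights still reproduce the original cell total, which is
# why the confusion matrix calculations work out
assert sum(weight for _, weight in rows) == cell_count
```

Patch 2 then reverts the unnecessary `res` temporary in favor of returning the `SegmentationResults` instance directly.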
From 1bb601d00f990a42e7ca7b7b56121604e7bf7a3a Mon Sep 17 00:00:00 2001
From: brimoor
Date: Tue, 31 Dec 2024 14:27:49 -0500
Subject: [PATCH 3/6] implement callbacks

---
 fiftyone/core/fields.py                     |  62 ++++++++
 fiftyone/utils/eval/segmentation.py         | 150 ++++++++++++--------
 plugins/panels/model_evaluation/__init__.py | 130 +++++++++++++++++
 3 files changed, 283 insertions(+), 59 deletions(-)

diff --git a/fiftyone/core/fields.py b/fiftyone/core/fields.py
index 4a9e7415730..eb401ea9f64 100644
--- a/fiftyone/core/fields.py
+++ b/fiftyone/core/fields.py
@@ -1621,6 +1621,68 @@ def is_rgb_target(target):
     )


+def hex_to_int(hex_str):
+    """Converts a hex string like `"#ff6d04"` to a hex integer.
+
+    Args:
+        hex_str: a hex string
+
+    Returns:
+        an integer
+    """
+    r = int(hex_str[1:3], 16)
+    g = int(hex_str[3:5], 16)
+    b = int(hex_str[5:7], 16)
+    return (r << 16) + (g << 8) + b
+
+
+def int_to_hex(value):
+    """Converts an RRGGBB integer value to a hex string like `"#ff6d04"`.
+
+    Args:
+        value: an integer value
+
+    Returns:
+        a hex string
+    """
+    r = (value >> 16) & 255
+    g = (value >> 8) & 255
+    b = value & 255
+    return "#%02x%02x%02x" % (r, g, b)
+
+
+def rgb_array_to_int(mask):
+    """Converts an RGB mask array to a 2D hex integer mask array.
+
+    Args:
+        mask: an RGB mask array
+
+    Returns:
+        a 2D integer mask array
+    """
+    return (
+        np.left_shift(mask[:, :, 0], 16, dtype=int)
+        + np.left_shift(mask[:, :, 1], 8, dtype=int)
+        + mask[:, :, 2]
+    )
+
+
+def int_array_to_rgb(mask):
+    """Converts a 2D hex integer mask array to an RGB mask array.
+
+    Args:
+        mask: a 2D integer mask array
+
+    Returns:
+        an RGB mask array
+    """
+    return np.stack(
+        [(mask >> 16) & 255, (mask >> 8) & 255, mask & 255],
+        axis=2,
+        dtype=np.uint8,
+    )
+
+
 class EmbeddedDocumentField(mongoengine.fields.EmbeddedDocumentField, Field):
     """A field that stores instances of a given type of
     :class:`fiftyone.core.odm.BaseEmbeddedDocument` object.

diff --git a/fiftyone/utils/eval/segmentation.py b/fiftyone/utils/eval/segmentation.py
index 94e6bb82d1a..68196888f30 100644
--- a/fiftyone/utils/eval/segmentation.py
+++ b/fiftyone/utils/eval/segmentation.py
@@ -369,7 +369,7 @@ def evaluate_samples(
         if mask_targets is not None:
             if fof.is_rgb_mask_targets(mask_targets):
                 mask_targets = {
-                    _hex_to_int(k): v for k, v in mask_targets.items()
+                    fof.hex_to_int(k): v for k, v in mask_targets.items()
                 }

             values, classes = zip(*sorted(mask_targets.items()))
@@ -385,8 +385,9 @@ def evaluate_samples(
         nc = len(values)
         confusion_matrix = np.zeros((nc, nc), dtype=int)
-        ypred_ids = {}
-        ytrue_ids = {}
+        weights_dict = {}
+        ytrue_ids_dict = {}
+        ypred_ids_dict = {}

         bandwidth = self.config.bandwidth
         average = self.config.average
@@ -428,16 +429,19 @@ def evaluate_samples(
                     bandwidth=bandwidth,
                 )
                 sample_conf_mat += image_conf_mat
-                non_zero_indexes = np.nonzero(sample_conf_mat)
-                for index in zip(*non_zero_indexes):
-                    if index not in ypred_ids:
-                        ypred_ids[index] = [pred_seg.id]
-                    else:
-                        ypred_ids[index].append(pred_seg.id)
-                    if index not in ytrue_ids:
-                        ytrue_ids[index] = [gt_seg.id]
-                    else:
-                        ytrue_ids[index].append(gt_seg.id)
+
+                for index in zip(*np.nonzero(image_conf_mat)):
+                    if index not in weights_dict:
+                        weights_dict[index] = []
+                    weights_dict[index].append(int(image_conf_mat[index]))
+
+                    if index not in ytrue_ids_dict:
+                        ytrue_ids_dict[index] = []
+                    ytrue_ids_dict[index].append(gt_seg.id)
+
+                    if index not in ypred_ids_dict:
+                        ypred_ids_dict[index] = []
+                    ypred_ids_dict[index].append(pred_seg.id)

                 if processing_frames and save:
                     facc, fpre, frec = _compute_accuracy_precision_recall(
@@ -472,8 +476,9 @@ def evaluate_samples(
             eval_key,
             confusion_matrix,
             classes,
-            ypred_ids=ypred_ids,
-            ytrue_ids=ytrue_ids,
+            weights_dict=weights_dict,
+            ytrue_ids_dict=ytrue_ids_dict,
+            ypred_ids_dict=ypred_ids_dict,
             missing=missing,
             backend=self,
         )
@@ -488,6 +493,11 @@ class SegmentationResults(BaseClassificationResults):
         eval_key: the evaluation key
         pixel_confusion_matrix: a pixel value confusion matrix
         classes: a list of class labels corresponding to the confusion matrix
+        weights_dict (None): a dict mapping ``(i, j)`` tuples to pixel counts
+        ytrue_ids_dict (None): a dict mapping ``(i, j)`` tuples to lists of
+            ground truth IDs
+        ypred_ids_dict (None): a dict mapping ``(i, j)`` tuples to lists of
+            predicted label IDs
         missing (None): a missing (background) class
         custom_metrics (None): an optional dict of custom metrics
         backend (None): a :class:`SegmentationEvaluation` backend
@@ -500,15 +510,27 @@ def __init__(
         eval_key,
         pixel_confusion_matrix,
         classes,
-        ypred_ids=None,
-        ytrue_ids=None,
+        weights_dict=None,
+        ytrue_ids_dict=None,
+        ypred_ids_dict=None,
         missing=None,
         custom_metrics=None,
         backend=None,
     ):
         pixel_confusion_matrix = np.asarray(pixel_confusion_matrix)
-        ytrue, ypred, weights, ytrue_ids, ypred_ids = self._parse_confusion_matrix(
-            pixel_confusion_matrix, classes, ypred_ids, ytrue_ids
+
+        (
+            ytrue,
+            ypred,
+            weights,
+            ytrue_ids,
+            ypred_ids,
+        ) = self._parse_confusion_matrix(
+            pixel_confusion_matrix,
+            classes,
+            weights_dict=weights_dict,
+            ytrue_ids_dict=ytrue_ids_dict,
+            ypred_ids_dict=ypred_ids_dict,
         )

         super().__init__(
@@ -527,12 +549,18 @@ def __init__(
         )

         self.pixel_confusion_matrix = pixel_confusion_matrix
+        self.weights_dict = weights_dict
+        self.ytrue_ids_dict = ytrue_ids_dict
+        self.ypred_ids_dict = ypred_ids_dict

     def attributes(self):
         return [
             "cls",
             "pixel_confusion_matrix",
             "classes",
+            "weights_dict",
+            "ytrue_ids_dict",
+            "ypred_ids_dict",
             "missing",
             "custom_metrics",
         ]
@@ -553,41 +581,67 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs):
             eval_key,
             d["pixel_confusion_matrix"],
             d["classes"],
+            weights_dict=_parse_index_dict(d.get("weights_dict", None)),
+            ytrue_ids_dict=_parse_index_dict(d.get("ytrue_ids_dict", None)),
+            ypred_ids_dict=_parse_index_dict(d.get("ypred_ids_dict", None)),
             missing=d.get("missing", None),
             custom_metrics=d.get("custom_metrics", None),
             **kwargs,
         )

     @staticmethod
-    def _parse_confusion_matrix(confusion_matrix, classes, ytrue_ids_dict, ypred_ids_dict):
+    def _parse_confusion_matrix(
+        confusion_matrix,
+        classes,
+        weights_dict=None,
+        ytrue_ids_dict=None,
+        ypred_ids_dict=None,
+    ):
+        have_ids = ytrue_ids_dict is not None and ypred_ids_dict is not None
+
         ytrue = []
         ypred = []
         weights = []
-        ytrue_ids = None
-        ypred_ids = None
-        if ytrue_ids_dict is not None and ypred_ids_dict is not None:
+        if have_ids:
             ytrue_ids = []
             ypred_ids = []
+        else:
+            ytrue_ids = None
+            ypred_ids = None
+
         nrows, ncols = confusion_matrix.shape
         for i in range(nrows):
             for j in range(ncols):
-                index = (i, j)
                 cij = confusion_matrix[i, j]
                 if cij > 0:
-                    if ytrue_ids_dict is not None and ypred_ids_dict is not None:
-                        ytrue_ids += ytrue_ids_dict[index]
-                        ypred_ids += ypred_ids_dict[index]
-                        ytrue_multiplier = len(ytrue_ids_dict[index])
+                    if have_ids:
+                        index = (i, j)
+                        classi = classes[i]
+                        classj = classes[j]
+                        for weight, ytrue_id, ypred_id in zip(
+                            weights_dict[index],
+                            ytrue_ids_dict[index],
+                            ypred_ids_dict[index],
+                        ):
+                            ytrue.append(classi)
+                            ypred.append(classj)
+                            weights.append(weight)
+                            ytrue_ids.append(ytrue_id)
+                            ypred_ids.append(ypred_id)
                     else:
-                        ytrue_multiplier = 1
-                    for p in range(ytrue_multiplier):
                         ytrue.append(classes[i])
                         ypred.append(classes[j])
-                        weights.append(cij / ytrue_multiplier)
-                        # Note: the weights aren't divided across the different ytrue_ids in proportion to each mask's true contribution, but it works out for the confusion matrix calculations
-
+                        weights.append(cij)

         return ytrue, ypred, weights, ytrue_ids, ypred_ids


+def _parse_index_dict(d):
+    import ast
+
+    return {ast.literal_eval(k): v for k, v in d.items()}
+
+
 def _parse_config(pred_field, gt_field, method, **kwargs):
     if method is None:
         method = fo.evaluation_config.default_segmentation_backend
@@ -630,10 +684,10 @@ def _compute_pixel_confusion_matrix(
     pred_mask, gt_mask, values, bandwidth=None
 ):
     if pred_mask.ndim == 3:
-        pred_mask = _rgb_array_to_int(pred_mask)
+        pred_mask = fof.rgb_array_to_int(pred_mask)

     if gt_mask.ndim == 3:
-        gt_mask = _rgb_array_to_int(gt_mask)
+        gt_mask = fof.rgb_array_to_int(gt_mask)

     if pred_mask.shape != gt_mask.shape:
         msg = (
@@ -706,37 +760,15 @@ def _get_mask_values(samples, pred_field, gt_field, progress=None):
                 mask = seg.get_mask()
                 if mask.ndim == 3:
                     is_rgb = True
-                    mask = _rgb_array_to_int(mask)
+                    mask = fof.rgb_array_to_int(mask)

                 values.update(mask.ravel())

     values = sorted(values)

     if is_rgb:
-        classes = [_int_to_hex(v) for v in values]
+        classes = [fof.int_to_hex(v) for v in values]
     else:
         classes = [str(v) for v in values]

     return values, classes
-
-
-def _rgb_array_to_int(mask):
-    return (
-        np.left_shift(mask[:, :, 0], 16, dtype=int)
-        + np.left_shift(mask[:, :, 1], 8, dtype=int)
-        + mask[:, :, 2]
-    )
-
-
-def _hex_to_int(hex_str):
-    r = int(hex_str[1:3], 16)
-    g = int(hex_str[3:5], 16)
-    b = int(hex_str[5:7], 16)
-    return (r << 16) + (g << 8) + b
-
-
-def _int_to_hex(value):
-    r = (value >> 16) & 255
-    g = (value >> 8) & 255
-    b = value & 255
-    return "#%02x%02x%02x" % (r, g, b)
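Note: Patch 3 promotes the private hex/int mask helpers to public functions in `fiftyone.core.fields` so that both the evaluation code and the App panel can share them. A quick round-trip through the new helpers (assumes this patch is applied; the color value is arbitrary):

```python
import numpy as np

import fiftyone.core.fields as fof

assert fof.hex_to_int("#ff6d04") == 0xFF6D04
assert fof.int_to_hex(0xFF6D04) == "#ff6d04"

# Collapse a 2x2 RGB mask into 2D integer values and invert it
rgb = np.full((2, 2, 3), (255, 109, 4), dtype=np.uint8)
ints = fof.rgb_array_to_int(rgb)  # each entry equals 0xFF6D04
assert np.array_equal(fof.int_array_to_rgb(ints), rgb)
```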
diff --git a/plugins/panels/model_evaluation/__init__.py b/plugins/panels/model_evaluation/__init__.py
index 35e850d6415..e31d96b4f6b 100644
--- a/plugins/panels/model_evaluation/__init__.py
+++ b/plugins/panels/model_evaluation/__init__.py
@@ -10,9 +10,11 @@
 import os
 import traceback

+from bson import ObjectId
 import numpy as np

 from fiftyone import ViewField as F
+import fiftyone.core.fields as fof
 from fiftyone.operators.categories import Categories
 from fiftyone.operators.panel import Panel, PanelConfig
 from fiftyone.core.plots.plotly import _to_log_colorscale
@@ -331,6 +333,26 @@ def get_mask_targets(self, dataset, gt_field):

         return None

+    def get_classes_map(self, dataset, results, gt_field):
+        classes = results.classes
+
+        #
+        # `results.classes` could contain any of the following:
+        # 1. stringified pixel values
+        # 2. RGB hex strings
+        # 3. label strings
+        #
+        # If mask targets are available, then App callbacks will use label
+        # strings, so we convert to label strings here
+        #
+        mask_targets = self.get_mask_targets(dataset, gt_field)
+        if mask_targets is not None:
+            # `str()` handles cases 1 and 2, and `.get(c, c)` handles case 3
+            mask_targets = {str(k): v for k, v in mask_targets.items()}
+            classes = [mask_targets.get(c, c) for c in classes]
+
+        return {c: i for i, c in enumerate(classes)}
+
     def load_evaluation(self, ctx):
         view_state = ctx.panel.get_state("view") or {}
         eval_key = view_state.get("key")
@@ -480,15 +502,18 @@ def load_view(self, ctx):
         info = ctx.dataset.get_evaluation_info(eval_key)
         pred_field = info.config.pred_field
         gt_field = info.config.gt_field
+        mask_targets = self.get_mask_targets(ctx.dataset, gt_field)

         eval_key2 = view_state.get("compareKey", None)
         pred_field2 = None
         gt_field2 = None
+        mask_targets2 = mask_targets
         if eval_key2:
             info2 = ctx.dataset.get_evaluation_info(eval_key2)
             pred_field2 = info2.config.pred_field
             if info2.config.gt_field != gt_field:
                 gt_field2 = info2.config.gt_field
+                mask_targets2 = self.get_mask_targets(ctx.dataset, gt_field2)

         x = view_options.get("x", None)
         y = view_options.get("y", None)
@@ -592,6 +617,89 @@ def load_view(self, ctx):
                 view = eval_view.filter_labels(
                     pred_field, F(eval_key) == field, only_matches=True
                 )
+            elif info.config.type == "segmentation":
+                results = ctx.dataset.load_evaluation_results(eval_key)
+                classes_map = self.get_classes_map(ctx.dataset, results, gt_field)
+                if (
+                    results.ytrue_ids_dict is None
+                    or results.ypred_ids_dict is None
+                ):
+                    # legacy segmentation evaluation
+                    return
+
+                if eval_key2:
+                    if gt_field2 is None:
+                        gt_field2 = gt_field
+
+                    results2 = ctx.dataset.load_evaluation_results(eval_key2)
+                    classes_map2 = self.get_classes_map(
+                        ctx.dataset, results2, gt_field2
+                    )
+                    if (
+                        results2.ytrue_ids_dict is None
+                        or results2.ypred_ids_dict is None
+                    ):
+                        # legacy segmentation evaluation
+                        return
+                else:
+                    results2 = None
+
+                _, gt_id = ctx.dataset._get_label_field_path(gt_field, "_id")
+                _, pred_id = ctx.dataset._get_label_field_path(pred_field, "_id")
+                if gt_field2 is not None:
+                    _, gt_id2 = ctx.dataset._get_label_field_path(gt_field2, "_id")
+                if pred_field2 is not None:
+                    _, pred_id2 = ctx.dataset._get_label_field_path(
+                        pred_field2, "_id"
+                    )
+
+                if view_type == "class":
+                    # All GT/predictions that contain class `x`
+                    k = classes_map[x]
+                    ytrue_ids, ypred_ids = _get_ids_slice(results, k)
+                    expr = F(gt_id).is_in(ytrue_ids)
+                    expr |= F(pred_id).is_in(ypred_ids)
+                    if results2 is not None:
+                        k2 = classes_map2[x]
+                        ytrue_ids2, ypred_ids2 = _get_ids_slice(results2, k2)
+                        expr |= F(gt_id2).is_in(ytrue_ids2)
+                        expr |= F(pred_id2).is_in(ypred_ids2)
+
+                    view = eval_view.match(expr)
+                elif view_type == "matrix":
+                    # Specific confusion matrix cell
+                    i = classes_map[x]
+                    j = classes_map[y]
+                    ytrue_ids = _to_object_ids(
+                        results.ytrue_ids_dict.get((i, j), [])
+                    )
+                    ypred_ids = _to_object_ids(
+                        results.ypred_ids_dict.get((i, j), [])
+                    )
+                    expr = F(gt_id).is_in(ytrue_ids)
+                    expr &= F(pred_id).is_in(ypred_ids)
+                    view = eval_view.match(expr)
+                elif view_type == "field":
+                    if field == "tp":
+                        # All true positives
+                        inds = results.ytrue == results.ypred
+                        ytrue_ids = _to_object_ids(results.ytrue_ids[inds])
+                        ypred_ids = _to_object_ids(results.ypred_ids[inds])
+                        expr = F(gt_id).is_in(ytrue_ids)
+                        expr &= F(pred_id).is_in(ypred_ids)
+                        view = eval_view.match(expr)
+                    elif field == "fn":
+                        # All false negatives
+                        inds = results.ypred == missing
+                        ytrue_ids = _to_object_ids(results.ytrue_ids[inds])
+                        expr = F(gt_id).is_in(ytrue_ids)
+                        view = eval_view.match(expr)
+                    else:
+                        # All false positives
+                        inds = results.ytrue == missing
+                        ypred_ids = _to_object_ids(results.ypred_ids[inds])
+                        expr = F(pred_id).is_in(ypred_ids)
+                        view = eval_view.match(expr)

         if view is not None:
             ctx.ops.set_view(view)
@@ -612,3 +720,25 @@ def render(self, ctx):
                 load_view=self.load_view,
             ),
         )
+
+
+def _to_object_ids(ids):
+    return [ObjectId(_id) for _id in ids]
+
+
+def _get_ids_slice(results, k):
+    nrows, ncols = results.pixel_confusion_matrix.shape
+
+    ytrue_ids = []
+    for j in range(ncols):
+        _ytrue_ids = results.ytrue_ids_dict.get((k, j), None)
+        if _ytrue_ids is not None:
+            ytrue_ids.extend(_ytrue_ids)
+
+    ypred_ids = []
+    for i in range(nrows):
+        _ypred_ids = results.ypred_ids_dict.get((i, k), None)
+        if _ypred_ids is not None:
+            ypred_ids.extend(_ypred_ids)
+
+    return _to_object_ids(ytrue_ids), _to_object_ids(ypred_ids)
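Note: the `(i, j) -> IDs` dicts above are what the panel's `load_view` callback uses to turn a confusion matrix click into a dataset view: it collects the ground truth and predicted label IDs recorded for the selected cell and matches on them. A simplified sketch of the expression it builds (the field paths and IDs below are illustrative; `F(...).is_in(...)` is the FiftyOne expression used in the patch):

```python
from bson import ObjectId

from fiftyone import ViewField as F

# Hypothetical label ID paths and per-cell ID lists
gt_id_path = "ground_truth._id"
pred_id_path = "predictions._id"
ytrue_ids = [ObjectId("676f00000000000000000001")]
ypred_ids = [ObjectId("676f00000000000000000002")]

# Samples whose labels landed in confusion matrix cell (i, j)
expr = F(gt_id_path).is_in(ytrue_ids) & F(pred_id_path).is_in(ypred_ids)
# view = eval_view.match(expr)
```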
From 5b17db118b3c007682e775eb6051b3558d1beeb5 Mon Sep 17 00:00:00 2001
From: brimoor
Date: Tue, 31 Dec 2024 19:31:50 -0500
Subject: [PATCH 4/6] store matches instead

---
 fiftyone/utils/eval/segmentation.py         | 151 ++++++-----------
 plugins/panels/model_evaluation/__init__.py | 179 ++++++++++++--------
 2 files changed, 163 insertions(+), 167 deletions(-)

diff --git a/fiftyone/utils/eval/segmentation.py b/fiftyone/utils/eval/segmentation.py
index 68196888f30..7bebee10f84 100644
--- a/fiftyone/utils/eval/segmentation.py
+++ b/fiftyone/utils/eval/segmentation.py
@@ -8,6 +8,7 @@
 from copy import deepcopy
 import logging
 import inspect
+import itertools
 import warnings

 import numpy as np
@@ -385,9 +386,7 @@ def evaluate_samples(
         nc = len(values)
         confusion_matrix = np.zeros((nc, nc), dtype=int)
-        weights_dict = {}
-        ytrue_ids_dict = {}
-        ypred_ids_dict = {}
+        matches = []

         bandwidth = self.config.bandwidth
         average = self.config.average
@@ -430,18 +429,16 @@ def evaluate_samples(
                 )
                 sample_conf_mat += image_conf_mat

-                for index in zip(*np.nonzero(image_conf_mat)):
-                    if index not in weights_dict:
-                        weights_dict[index] = []
-                    weights_dict[index].append(int(image_conf_mat[index]))
-
-                    if index not in ytrue_ids_dict:
-                        ytrue_ids_dict[index] = []
-                    ytrue_ids_dict[index].append(gt_seg.id)
-
-                    if index not in ypred_ids_dict:
-                        ypred_ids_dict[index] = []
-                    ypred_ids_dict[index].append(pred_seg.id)
+                for i, j in zip(*np.nonzero(image_conf_mat)):
+                    matches.append(
+                        (
+                            classes[i],
+                            classes[j],
+                            int(image_conf_mat[i, j]),
+                            gt_seg.id,
+                            pred_seg.id,
+                        )
+                    )

                 if processing_frames and save:
                     facc, fpre, frec = _compute_accuracy_precision_recall(
@@ -476,9 +473,7 @@ def evaluate_samples(
             eval_key,
             confusion_matrix,
             classes,
-            weights_dict=weights_dict,
-            ytrue_ids_dict=ytrue_ids_dict,
-            ypred_ids_dict=ypred_ids_dict,
+            matches=matches,
             missing=missing,
             backend=self,
         )
@@ -493,11 +488,9 @@ class SegmentationResults(BaseClassificationResults):
         eval_key: the evaluation key
         pixel_confusion_matrix: a pixel value confusion matrix
         classes: a list of class labels corresponding to the confusion matrix
-        weights_dict (None): a dict mapping ``(i, j)`` tuples to pixel counts
-        ytrue_ids_dict (None): a dict mapping ``(i, j)`` tuples to lists of
-            ground truth IDs
-        ypred_ids_dict (None): a dict mapping ``(i, j)`` tuples to lists of
-            predicted label IDs
+        matches (None): a list of
+            ``(gt_label, pred_label, pixel_count, gt_id, pred_id)``
+            matches
         missing (None): a missing (background) class
         custom_metrics (None): an optional dict of custom metrics
         backend (None): a :class:`SegmentationEvaluation` backend
@@ -510,28 +503,23 @@ def __init__(
         eval_key,
         pixel_confusion_matrix,
         classes,
-        weights_dict=None,
-        ytrue_ids_dict=None,
-        ypred_ids_dict=None,
+        matches=None,
         missing=None,
         custom_metrics=None,
         backend=None,
     ):
         pixel_confusion_matrix = np.asarray(pixel_confusion_matrix)

-        (
-            ytrue,
-            ypred,
-            weights,
-            ytrue_ids,
-            ypred_ids,
-        ) = self._parse_confusion_matrix(
-            pixel_confusion_matrix,
-            classes,
-            weights_dict=weights_dict,
-            ytrue_ids_dict=ytrue_ids_dict,
-            ypred_ids_dict=ypred_ids_dict,
-        )
+        if matches is None:
+            ytrue, ypred, weights = self._parse_confusion_matrix(
+                pixel_confusion_matrix, classes
+            )
+            ytrue_ids = None
+            ypred_ids = None
+        elif matches:
+            ytrue, ypred, weights, ytrue_ids, ypred_ids = zip(*matches)
+        else:
+            ytrue, ypred, weights, ytrue_ids, ypred_ids = [], [], [], [], []

         super().__init__(
             samples,
@@ -549,21 +537,6 @@ def __init__(
         )

         self.pixel_confusion_matrix = pixel_confusion_matrix
-        self.weights_dict = weights_dict
-        self.ytrue_ids_dict = ytrue_ids_dict
-        self.ypred_ids_dict = ypred_ids_dict
-
-    def attributes(self):
-        return [
-            "cls",
-            "pixel_confusion_matrix",
-            "classes",
-            "weights_dict",
-            "ytrue_ids_dict",
-            "ypred_ids_dict",
-            "missing",
-            "custom_metrics",
-        ]

     def dice_score(self):
         """Computes the Dice score across all samples in the evaluation.
@@ -575,71 +548,51 @@ def dice_score(self):

     @classmethod
     def _from_dict(cls, d, samples, config, eval_key, **kwargs):
+        ytrue = d.get("ytrue", None)
+        ypred = d.get("ypred", None)
+        weights = d.get("weights", None)
+        ytrue_ids = d.get("ytrue_ids", None)
+        ypred_ids = d.get("ypred_ids", None)
+
+        if ytrue is not None and ypred is not None and weights is not None:
+            if ytrue_ids is None:
+                ytrue_ids = itertools.repeat(None)
+
+            if ypred_ids is None:
+                ypred_ids = itertools.repeat(None)
+
+            matches = list(zip(ytrue, ypred, weights, ytrue_ids, ypred_ids))
+        else:
+            # Legacy format segmentations
+            matches = None
+
         return cls(
             samples,
             config,
             eval_key,
             d["pixel_confusion_matrix"],
             d["classes"],
-            weights_dict=_parse_index_dict(d.get("weights_dict", None)),
-            ytrue_ids_dict=_parse_index_dict(d.get("ytrue_ids_dict", None)),
-            ypred_ids_dict=_parse_index_dict(d.get("ypred_ids_dict", None)),
+            matches=matches,
             missing=d.get("missing", None),
             custom_metrics=d.get("custom_metrics", None),
             **kwargs,
         )

     @staticmethod
-    def _parse_confusion_matrix(
-        confusion_matrix,
-        classes,
-        weights_dict=None,
-        ytrue_ids_dict=None,
-        ypred_ids_dict=None,
-    ):
-        have_ids = ytrue_ids_dict is not None and ypred_ids_dict is not None
-
+    def _parse_confusion_matrix(confusion_matrix, classes):
         ytrue = []
         ypred = []
         weights = []
-        if have_ids:
-            ytrue_ids = []
-            ypred_ids = []
-        else:
-            ytrue_ids = None
-            ypred_ids = None
-
         nrows, ncols = confusion_matrix.shape
         for i in range(nrows):
             for j in range(ncols):
                 cij = confusion_matrix[i, j]
                 if cij > 0:
-                    if have_ids:
-                        index = (i, j)
-                        classi = classes[i]
-                        classj = classes[j]
-                        for weight, ytrue_id, ypred_id in zip(
-                            weights_dict[index],
-                            ytrue_ids_dict[index],
-                            ypred_ids_dict[index],
-                        ):
-                            ytrue.append(classi)
-                            ypred.append(classj)
-                            weights.append(weight)
-                            ytrue_ids.append(ytrue_id)
-                            ypred_ids.append(ypred_id)
-                    else:
-                        ytrue.append(classes[i])
-                        ypred.append(classes[j])
-                        weights.append(cij)
-
-        return ytrue, ypred, weights, ytrue_ids, ypred_ids
-
-
-def _parse_index_dict(d):
-    import ast
-
-    return {ast.literal_eval(k): v for k, v in d.items()}
+                    ytrue.append(classes[i])
+                    ypred.append(classes[j])
+                    weights.append(cij)
+
+        return ytrue, ypred, weights


 def _parse_config(pred_field, gt_field, method, **kwargs):
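Note: Patch 4 replaces the three parallel `(i, j)`-keyed dicts with a single flat `matches` list of `(gt_label, pred_label, pixel_count, gt_id, pred_id)` tuples, which also sidesteps the tuple-key serialization that `_parse_index_dict` existed for. A standalone sketch of how the constructor unzips them, with hypothetical tuples, including the empty-list edge case the `elif matches:` guard handles:

```python
matches = [
    ("cat", "cat", 1200, "gt1", "pred1"),
    ("cat", "dog", 35, "gt1", "pred1"),
]

# zip(*matches) transposes the tuples into the parallel arrays that
# BaseClassificationResults expects
ytrue, ypred, weights, ytrue_ids, ypred_ids = zip(*matches)
assert weights == (1200, 35)

# Unpacking zip(*[]) raises ValueError, which is why an empty `matches`
# list is handled in a separate branch
```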
diff --git a/plugins/panels/model_evaluation/__init__.py b/plugins/panels/model_evaluation/__init__.py
index e31d96b4f6b..0f2242dab89 100644
--- a/plugins/panels/model_evaluation/__init__.py
+++ b/plugins/panels/model_evaluation/__init__.py
@@ -333,26 +333,6 @@ def get_mask_targets(self, dataset, gt_field):

         return None

-    def get_classes_map(self, dataset, results, gt_field):
-        classes = results.classes
-
-        #
-        # `results.classes` could contain any of the following:
-        # 1. stringified pixel values
-        # 2. RGB hex strings
-        # 3. label strings
-        #
-        # If mask targets are available, then App callbacks will use label
-        # strings, so we convert to label strings here
-        #
-        mask_targets = self.get_mask_targets(dataset, gt_field)
-        if mask_targets is not None:
-            # `str()` handles cases 1 and 2, and `.get(c, c)` handles case 3
-            mask_targets = {str(k): v for k, v in mask_targets.items()}
-            classes = [mask_targets.get(c, c) for c in classes]
-
-        return {c: i for i, c in enumerate(classes)}
-
     def load_evaluation(self, ctx):
         view_state = ctx.panel.get_state("view") or {}
         eval_key = view_state.get("key")
@@ -373,13 +353,15 @@ def load_evaluation(self, ctx):
                 {"error": "unsupported", "info": serialized_info},
             )
             return
-        gt_field = info.config.gt_field
-        mask_targets = (
-            self.get_mask_targets(ctx.dataset, gt_field)
-            if evaluation_type == "segmentation"
-            else None
-        )
+
         results = ctx.dataset.load_evaluation_results(computed_eval_key)
+        gt_field = info.config.gt_field
+        mask_targets = None
+
+        if evaluation_type == "segmentation":
+            mask_targets = self.get_mask_targets(ctx.dataset, gt_field)
+            _init_segmentation_results(results, mask_targets)
+
         metrics = results.metrics()
         per_class_metrics = self.get_per_class_metrics(info, results)
         metrics["average_confidence"] = self.get_avg_confidence(
@@ -502,18 +484,15 @@ def load_view(self, ctx):
         info = ctx.dataset.get_evaluation_info(eval_key)
         pred_field = info.config.pred_field
         gt_field = info.config.gt_field
-        mask_targets = self.get_mask_targets(ctx.dataset, gt_field)

         eval_key2 = view_state.get("compareKey", None)
         pred_field2 = None
         gt_field2 = None
-        mask_targets2 = mask_targets
         if eval_key2:
             info2 = ctx.dataset.get_evaluation_info(eval_key2)
             pred_field2 = info2.config.pred_field
             if info2.config.gt_field != gt_field:
                 gt_field2 = info2.config.gt_field
-                mask_targets2 = self.get_mask_targets(ctx.dataset, gt_field2)

         x = view_options.get("x", None)
         y = view_options.get("y", None)
@@ -619,12 +598,8 @@ def load_view(self, ctx):
             )
         elif info.config.type == "segmentation":
             results = ctx.dataset.load_evaluation_results(eval_key)
-            classes_map = self.get_classes_map(ctx.dataset, results, gt_field)
-            if (
-                results.ytrue_ids_dict is None
-                or results.ypred_ids_dict is None
-            ):
-                # legacy segmentation evaluation
+            if results.ytrue_ids is None or results.ypred_ids is None:
+                # Legacy format segmentations
                 return

             if eval_key2:
@@ -632,14 +607,8 @@ def load_view(self, ctx):
                     gt_field2 = gt_field

                 results2 = ctx.dataset.load_evaluation_results(eval_key2)
-                classes_map2 = self.get_classes_map(
-                    ctx.dataset, results2, gt_field2
-                )
-                if (
-                    results2.ytrue_ids_dict is None
-                    or results2.ypred_ids_dict is None
-                ):
-                    # legacy segmentation evaluation
+                if results2.ytrue_ids is None or results2.ypred_ids is None:
+                    # Legacy format segmentations
                     return
             else:
                 results2 = None
@@ -655,26 +624,21 @@ def load_view(self, ctx):

             if view_type == "class":
                 # All GT/predictions that contain class `x`
-                k = classes_map[x]
-                ytrue_ids, ypred_ids = _get_ids_slice(results, k)
+                ytrue_ids, ypred_ids = _get_segmentation_class_ids(results, x)
                 expr = F(gt_id).is_in(ytrue_ids)
                 expr |= F(pred_id).is_in(ypred_ids)
                 if results2 is not None:
-                    k2 = classes_map2[x]
-                    ytrue_ids2, ypred_ids2 = _get_ids_slice(results2, k2)
+                    ytrue_ids2, ypred_ids2 = _get_segmentation_class_ids(
+                        results2, x
+                    )
                     expr |= F(gt_id2).is_in(ytrue_ids2)
                     expr |= F(pred_id2).is_in(ypred_ids2)

                 view = eval_view.match(expr)
             elif view_type == "matrix":
                 # Specific confusion matrix cell
-                i = classes_map[x]
-                j = classes_map[y]
-                ytrue_ids = _to_object_ids(
-                    results.ytrue_ids_dict.get((i, j), [])
-                )
-                ypred_ids = _to_object_ids(
-                    results.ypred_ids_dict.get((i, j), [])
+                ytrue_ids, ypred_ids = _get_segmentation_conf_mat_ids(
+                    results, x, y
                 )
                 expr = F(gt_id).is_in(ytrue_ids)
                 expr &= F(pred_id).is_in(ypred_ids)
                 view = eval_view.match(expr)
             elif view_type == "field":
                 if field == "tp":
                     # All true positives
-                    inds = results.ytrue == results.ypred
-                    ytrue_ids = _to_object_ids(results.ytrue_ids[inds])
-                    ypred_ids = _to_object_ids(results.ypred_ids[inds])
+                    ytrue_ids, ypred_ids = _get_segmentation_tp_fp_fn_ids(
+                        results, field
+                    )
                     expr = F(gt_id).is_in(ytrue_ids)
                     expr &= F(pred_id).is_in(ypred_ids)
                     view = eval_view.match(expr)
                 elif field == "fn":
                     # All false negatives
-                    inds = results.ypred == missing
-                    ytrue_ids = _to_object_ids(results.ytrue_ids[inds])
+                    ytrue_ids, _ = _get_segmentation_tp_fp_fn_ids(
+                        results, field
+                    )
                     expr = F(gt_id).is_in(ytrue_ids)
                     view = eval_view.match(expr)
                 else:
                     # All false positives
-                    inds = results.ytrue == missing
-                    ypred_ids = _to_object_ids(results.ypred_ids[inds])
+                    _, ypred_ids = _get_segmentation_tp_fp_fn_ids(
+                        results, field
+                    )
                     expr = F(pred_id).is_in(ypred_ids)
                     view = eval_view.match(expr)

@@ -722,23 +688,100 @@ def render(self, ctx):
     )


-def _to_object_ids(ids):
-    return [ObjectId(_id) for _id in ids]
-
-
-def _get_ids_slice(results, k):
+def _init_segmentation_results(results, mask_targets):
+    if results.ytrue_ids is None or results.ypred_ids is None:
+        # Legacy format segmentations
+        return
+
+    #
+    # `results.classes` and App callbacks could contain any of the
+    # following:
+    # 1. stringified pixel values
+    # 2. RGB hex strings
+    # 3. label strings
+    #
+    # so we must construct `classes_map` that can map any of these possible
+    # values to integer indexes
+    #
+    classes_map = {c: i for i, c in enumerate(results.classes)}
+
+    if mask_targets is not None:
+        # `str()` handles cases 1 and 2, and `.get(c, c)` handles case 3
+        mask_targets = {str(k): v for k, v in mask_targets.items()}
+        classes = [mask_targets.get(c, c) for c in results.classes]
+        classes_map.update({c: i for i, c in enumerate(classes)})
+
+    #
+    # Generate mapping from `(i, j)` to ID lists for use in App callbacks
+    #
+
+    ytrue_ids_dict = {}
+    ypred_ids_dict = {}
+    for ytrue, ypred, ytrue_id, ypred_id in zip(
+        results.ytrue, results.ypred, results.ytrue_ids, results.ypred_ids
+    ):
+        i = classes_map[ytrue]
+        j = classes_map[ypred]
+        index = (i, j)
+
+        if index not in ytrue_ids_dict:
+            ytrue_ids_dict[index] = []
+        ytrue_ids_dict[index].append(ytrue_id)
+
+        if index not in ypred_ids_dict:
+            ypred_ids_dict[index] = []
+        ypred_ids_dict[index].append(ypred_id)
+
+    results._classes_map = classes_map
+    results._ytrue_ids_dict = ytrue_ids_dict
+    results._ypred_ids_dict = ypred_ids_dict
+
+
+def _get_segmentation_class_ids(results, x):
+    k = results._classes_map[x]
     nrows, ncols = results.pixel_confusion_matrix.shape

     ytrue_ids = []
     for j in range(ncols):
-        _ytrue_ids = results.ytrue_ids_dict.get((k, j), None)
+        _ytrue_ids = results._ytrue_ids_dict.get((k, j), None)
         if _ytrue_ids is not None:
             ytrue_ids.extend(_ytrue_ids)

     ypred_ids = []
     for i in range(nrows):
-        _ypred_ids = results.ypred_ids_dict.get((i, k), None)
+        _ypred_ids = results._ypred_ids_dict.get((i, k), None)
         if _ypred_ids is not None:
             ypred_ids.extend(_ypred_ids)

     return _to_object_ids(ytrue_ids), _to_object_ids(ypred_ids)
+
+
+def _get_segmentation_conf_mat_ids(results, x, y):
+    i = results._classes_map[x]
+    j = results._classes_map[y]
+    ytrue_ids = _to_object_ids(results._ytrue_ids_dict.get((i, j), []))
+    ypred_ids = _to_object_ids(results._ypred_ids_dict.get((i, j), []))
+    return ytrue_ids, ypred_ids
+
+
+def _get_segmentation_tp_fp_fn_ids(results, field):
+    if field == "tp":
+        # True positives
+        inds = results.ytrue == results.ypred
+        ytrue_ids = _to_object_ids(results.ytrue_ids[inds])
+        ypred_ids = _to_object_ids(results.ypred_ids[inds])
+        return ytrue_ids, ypred_ids
+    elif field == "fn":
+        # False negatives
+        inds = results.ypred == results.missing
+        ytrue_ids = _to_object_ids(results.ytrue_ids[inds])
+        return ytrue_ids, None
+    else:
+        # False positives
+        inds = results.ytrue == results.missing
+        ypred_ids = _to_object_ids(results.ypred_ids[inds])
+        return None, ypred_ids
+
+
+def _to_object_ids(ids):
+    return [ObjectId(_id) for _id in ids]
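Note: since `matches` stores class labels rather than `(i, j)` indexes, `_init_segmentation_results` rebuilds the per-cell ID dicts lazily in the panel, and its `classes_map` must resolve whichever spelling of a class an App callback sends back. A standalone sketch of that mapping trick with hypothetical values:

```python
classes = ["#ff6d04", "#499cef"]  # e.g., RGB hex classes
mask_targets = {"#ff6d04": "cat", "#499cef": "dog"}

# Map both the raw class labels and their mask target names to the same
# row/column index
classes_map = {c: i for i, c in enumerate(classes)}
classes_map.update(
    {mask_targets.get(c, c): i for i, c in enumerate(classes)}
)

assert classes_map["cat"] == classes_map["#ff6d04"] == 0
```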
From 59d404446f110aee1113292bb927aac3928fc93d Mon Sep 17 00:00:00 2001
From: brimoor
Date: Fri, 10 Jan 2025 09:56:27 -0500
Subject: [PATCH 5/6] more robust initialization

---
 fiftyone/server/utils.py                    | 15 +++++++
 plugins/panels/model_evaluation/__init__.py | 43 ++++++++++++++-------
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/fiftyone/server/utils.py b/fiftyone/server/utils.py
index 6104f35c73a..31448b45d8d 100644
--- a/fiftyone/server/utils.py
+++ b/fiftyone/server/utils.py
@@ -49,6 +49,21 @@ def load_and_cache_dataset(name):
     return dataset


+def cache_dataset(dataset):
+    """Caches the given dataset.
+
+    This method ensures that subsequent calls to
+    :func:`fiftyone.core.dataset.load_dataset` in async contexts will return
+    this dataset singleton.
+
+    See :meth:`load_and_cache_dataset` for additional details.
+
+    Args:
+        dataset: a :class:`fiftyone.core.dataset.Dataset`
+    """
+    _cache[dataset.name] = dataset
+
+
 def change_sample_tags(sample_collection, changes):
     """Applies the changes to tags to all samples of the collection, if
     necessary.

diff --git a/plugins/panels/model_evaluation/__init__.py b/plugins/panels/model_evaluation/__init__.py
index 0f2242dab89..31849ff94e0 100644
--- a/plugins/panels/model_evaluation/__init__.py
+++ b/plugins/panels/model_evaluation/__init__.py
@@ -323,16 +323,6 @@ def get_confusion_matrices(self, results):
             "lc_colorscale": lc_colorscale,
         }

-    def get_mask_targets(self, dataset, gt_field):
-        mask_targets = dataset.mask_targets.get(gt_field, None)
-        if mask_targets:
-            return mask_targets
-
-        if dataset.default_mask_targets:
-            return dataset.default_mask_targets
-
-        return None
-
     def load_evaluation(self, ctx):
         view_state = ctx.panel.get_state("view") or {}
         eval_key = view_state.get("key")
@@ -359,8 +349,8 @@ def load_evaluation(self, ctx):
         mask_targets = None

         if evaluation_type == "segmentation":
-            mask_targets = self.get_mask_targets(ctx.dataset, gt_field)
-            _init_segmentation_results(results, mask_targets)
+            mask_targets = _get_mask_targets(ctx.dataset, gt_field)
+            _init_segmentation_results(ctx.dataset, results, gt_field)

         metrics = results.metrics()
         per_class_metrics = self.get_per_class_metrics(info, results)
@@ -598,6 +588,7 @@ def load_view(self, ctx):
             )
         elif info.config.type == "segmentation":
             results = ctx.dataset.load_evaluation_results(eval_key)
+            _init_segmentation_results(ctx.dataset, results, gt_field)
             if results.ytrue_ids is None or results.ypred_ids is None:
                 # Legacy format segmentations
                 return
@@ -607,6 +598,7 @@ def load_view(self, ctx):
                     gt_field2 = gt_field

                 results2 = ctx.dataset.load_evaluation_results(eval_key2)
+                _init_segmentation_results(ctx.dataset, results2, gt_field2)
                 if results2.ytrue_ids is None or results2.ypred_ids is None:
                     # Legacy format segmentations
                     return
@@ -680,11 +672,35 @@ def render(self, ctx):
     )


-def _init_segmentation_results(results, mask_targets):
+def _get_mask_targets(dataset, gt_field):
+    mask_targets = dataset.mask_targets.get(gt_field, None)
+    if mask_targets:
+        return mask_targets
+
+    if dataset.default_mask_targets:
+        return dataset.default_mask_targets
+
+    return None
+
+
+def _init_segmentation_results(dataset, results, gt_field):
     if results.ytrue_ids is None or results.ypred_ids is None:
         # Legacy format segmentations
         return

+    if getattr(results, "_classes_map", None):
+        # Already initialized
+        return
+
+    #
+    # Ensure the dataset singleton is cached so that subsequent callbacks on
+    # this panel will use the same `dataset` and hence `results`
+    #
+
+    import fiftyone.server.utils as fosu
+
+    fosu.cache_dataset(dataset)
+
     #
     # `results.classes` and App callbacks could contain any of the
     # following:
     # 1. stringified pixel values
     # 2. RGB hex strings
     # 3. label strings
     #
     # so we must construct `classes_map` that can map any of these possible
     # values to integer indexes
     #
     classes_map = {c: i for i, c in enumerate(results.classes)}

+    mask_targets = _get_mask_targets(dataset, gt_field)
     if mask_targets is not None:
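Note: the caching above matters because panel callbacks run asynchronously; without it, each callback could load a fresh `Dataset` object (and hence fresh `results`), discarding the `_classes_map` and ID dicts attached during initialization. A usage sketch of the new helper (the dataset name is illustrative):

```python
import fiftyone as fo
import fiftyone.server.utils as fosu

dataset = fo.load_dataset("quickstart")

# Subsequent load_dataset() calls in async server code now resolve to
# this same in-memory singleton
fosu.cache_dataset(dataset)
```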
From 3d11a021cf760c9cbe03026f83812b8b9d2a7ee3 Mon Sep 17 00:00:00 2001
From: brimoor
Date: Fri, 24 Jan 2025 00:20:35 -0500
Subject: [PATCH 6/6] handle missing custom metrics

---
 .../components/NativeModelEvaluationView/Evaluation.tsx | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/app/packages/core/src/plugins/SchemaIO/components/NativeModelEvaluationView/Evaluation.tsx b/app/packages/core/src/plugins/SchemaIO/components/NativeModelEvaluationView/Evaluation.tsx
index f9c6ade6d4b..75dfa5e3b43 100644
--- a/app/packages/core/src/plugins/SchemaIO/components/NativeModelEvaluationView/Evaluation.tsx
+++ b/app/packages/core/src/plugins/SchemaIO/components/NativeModelEvaluationView/Evaluation.tsx
@@ -1783,11 +1783,8 @@ type SummaryRow = {

 function formatCustomMetricRows(evaluationMetrics, comparisonMetrics) {
   const results = [] as SummaryRow[];
-  const customMetrics = _.get(
-    evaluationMetrics,
-    "custom_metrics",
-    {}
-  ) as CustomMetrics;
+  const customMetrics = (_.get(evaluationMetrics, "custom_metrics", null) ||
+    {}) as CustomMetrics;
   for (const [operatorUri, customMetric] of Object.entries(customMetrics)) {
     const compareValue = _.get(
       comparisonMetrics,
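Note: the bug patch 6 fixes is that lodash's `_.get()` only falls back to its default when the key is absent, not when the stored value is `null`, so `Object.entries()` would then be called on `null`. The same pitfall exists with Python's `dict.get()`, sketched here for contrast:

```python
evaluation_metrics = {"custom_metrics": None}

# The default only applies when the key is missing, not when the stored
# value is None
custom_metrics = evaluation_metrics.get("custom_metrics", {})
assert custom_metrics is None

# Coercing falsy values restores the intended default, mirroring the fix
custom_metrics = evaluation_metrics.get("custom_metrics", None) or {}
assert custom_metrics == {}
```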