Feature/semseg polygon dep #592

Merged · 18 commits · May 12, 2023
3 changes: 2 additions & 1 deletion .gitignore
@@ -179,4 +179,5 @@ coco.ipynb

# SemSeg
CV_datasets/
coco_hf_dataset.py
coco_hf_dataset.py
coco_deeplab_hooks.ipynb
@@ -322,6 +322,7 @@ def _on_step_end(self) -> None:
# do not log if we are not in the final inference loop
if not self.called_finish:
return
print("logging")
logger = SemanticSegmentationModelLogger(
bucket_name=self.bucket_name,
image_paths=image_paths,
126 changes: 79 additions & 47 deletions dataquality/loggers/model_logger/semantic_segmentation.py
@@ -1,4 +1,4 @@
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
@@ -9,8 +9,10 @@
semantic_segmentation_logger_config,
)
from dataquality.loggers.model_logger.base_model_logger import BaseGalileoModelLogger
from dataquality.schemas.semantic_segmentation import Polygon
from dataquality.schemas.split import Split
from dataquality.utils.semantic_segmentation.errors import (
calculate_dep_polygons_batch,
calculate_misclassified_polygons_batch,
calculate_undetected_polygons_batch,
)
@@ -104,6 +106,66 @@ def dep_path(self) -> str:
def contours_path(self) -> str:
return f"{self.proj_run}/{self.split_name_path}/contours"

def get_polygon_data(
self,
pred_polygons_batch: List[List[Polygon]],
gold_polygons_batch: List[List[Polygon]],
) -> Dict[str, Any]:
"""Returns polygon data for a batch of images in a dictionary
that can then be used for our polygon df

Args:
pred_polygons_batch (List[List[Polygon]]): polygon data for predictions
in a minibatch of images
gold_polygons_batch (List[List[Polygon]]): polygon data for ground truth
in a minibatch of images

Returns:
Dict[str, Any]: a dict that can be used to create a polygon df
"""
image_ids = []
polygon_ids = []
preds = []
golds = []
data_error_potentials = []
errors = []
for i, image_id in enumerate(self.image_ids):
pred_polygons = pred_polygons_batch[i]
for polygon in pred_polygons:
image_ids.append(image_id)
preds.append(polygon.label_idx)
golds.append(-1)
data_error_potentials.append(polygon.data_error_potential)
errors.append(polygon.error_type.value)
upload_polygon_contours(
polygon, self.logger_config.polygon_idx, self.contours_path
)
polygon_ids.append(self.logger_config.polygon_idx)
self.logger_config.polygon_idx += 1
gold_polygons = gold_polygons_batch[i]
for polygon in gold_polygons:
image_ids.append(image_id)
preds.append(-1)
golds.append(polygon.label_idx)
data_error_potentials.append(polygon.data_error_potential)
errors.append(polygon.error_type.value)
upload_polygon_contours(
polygon, self.logger_config.polygon_idx, self.contours_path
)
polygon_ids.append(self.logger_config.polygon_idx)
self.logger_config.polygon_idx += 1

polygon_data = {
"id": polygon_ids,
"image_id": image_ids,
"pred": preds,
"gold": golds,
"data_error_potential": data_error_potentials,
"galileo_error_type": errors,
"split": [self.split] * len(image_ids),
}
return polygon_data
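
For orientation, a minimal sketch of what this helper returns, assuming a hypothetical batch of one image (id 17) that yields one predicted and one gold polygon; every value below is illustrative, not from the repo:

    polygon_data = {
        "id": [0, 1],                           # globally increasing polygon_idx
        "image_id": [17, 17],                   # each polygon row points back to its image
        "pred": [3, -1],                        # pred polygons carry a label_idx; gold rows get -1
        "gold": [-1, 3],                        # and vice versa for gold polygons
        "data_error_potential": [0.42, 0.37],
        "galileo_error_type": ["none", "none"],
        "split": ["training", "training"],
    }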

def _get_data_dict(self) -> Dict:
"""Returns a dictionary of data to be logged as a DataFrame"""
# DEP & likely mislabeled
@@ -112,7 +174,7 @@ def _get_data_dict(self) -> Dict:
self.mislabled_pixels, self.image_ids, prefix=self.lm_path
)

image_dep = calculate_and_upload_dep(
image_dep, dep_heatmaps = calculate_and_upload_dep(
self.output_probs,
self.gold_masks,
self.image_ids,
@@ -132,14 +194,23 @@
# Errors
calculate_misclassified_polygons_batch(self.pred_masks, gold_polygons_batch)
calculate_undetected_polygons_batch(self.pred_masks, gold_polygons_batch)
heights = [img.shape[-1] for img in self.gold_masks]
widths = [img.shape[-2] for img in self.gold_masks]

calculate_dep_polygons_batch(
gold_polygons_batch,
dep_heatmaps.numpy(),
height=heights,
width=widths,
)

image_data = {
"image": [
f"{self.bucket_name}/{pth}" for pth in self.image_paths
], # E.g. https://storage.googleapis.com/bucket_name/.../image_id.png
"id": self.image_ids,
"height": [img.shape[-1] for img in self.gold_masks],
"width": [img.shape[-2] for img in self.gold_masks],
"height": heights,
"width": widths,
"image_data_error_potential": image_dep,
"mean_lm_score": [i for i in mean_mislabeled],
"mean_iou": iou,
@@ -157,50 +228,11 @@
meta=meta_keys,
)

image_ids = []
polygon_ids = []
preds = []
golds = []
data_error_potentials = []
errors = []
for i, image_id in enumerate(self.image_ids):
pred_polygons = pred_polygons_batch[i]
for polygon in pred_polygons:
image_ids.append(image_id)
preds.append(polygon.label_idx)
golds.append(-1)
data_error_potentials.append(0.0)
errors.append(polygon.error_type.value)
upload_polygon_contours(
polygon, self.logger_config.polygon_idx, self.contours_path
)
polygon_ids.append(self.logger_config.polygon_idx)
self.logger_config.polygon_idx += 1
gold_polygons = gold_polygons_batch[i]
for polygon in gold_polygons:
image_ids.append(image_id)
preds.append(-1)
golds.append(polygon.label_idx)
data_error_potentials.append(0.0)
errors.append(polygon.error_type.value)
upload_polygon_contours(
polygon, self.logger_config.polygon_idx, self.contours_path
)
polygon_ids.append(self.logger_config.polygon_idx)
self.logger_config.polygon_idx += 1

polygon_data = {
"id": polygon_ids,
"image_id": image_ids,
"pred": preds,
"gold": golds,
"data_error_potential": data_error_potentials,
"galileo_error_type": errors,
"split": [self.split] * len(image_ids),
}
polygon_data = self.get_polygon_data(pred_polygons_batch, gold_polygons_batch)
n_polygons = len(polygon_data["image_id"])  # one row per logged polygon
if self.split == Split.inference:
polygon_data["inference_name"] = [self.inference_name] * n_polygons
else:
polygon_data["epoch"] = [self.epoch] * n_polygons

return polygon_data
1 change: 1 addition & 0 deletions dataquality/schemas/semantic_segmentation.py
@@ -52,6 +52,7 @@ class Polygon(BaseModel):
misclassified_class_label: Optional[int] = None
error_type: ErrorType = ErrorType.none
contours: List[Contour]
data_error_potential: Optional[float] = None

@property
def contours_opencv(self) -> List[np.ndarray]:
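
The new data_error_potential field defaults to None and is only filled in once calculate_dep_polygons_batch (added to errors.py below) runs over the gold polygons. A hedged illustration of that lifecycle, reusing the variable names from the hunks in this PR:

    polygon = gold_polygons_batch[0][0]
    assert polygon.data_error_potential is None   # schema default: not yet scored
    calculate_dep_polygons_batch(
        gold_polygons_batch, dep_heatmaps.numpy(), height=heights, width=widths
    )
    print(polygon.data_error_potential)           # now the mean dep over the polygon's pixels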
52 changes: 52 additions & 0 deletions dataquality/utils/semantic_segmentation/errors.py
@@ -2,6 +2,7 @@

import numpy as np
import torch
from PIL import Image

from dataquality.schemas.semantic_segmentation import ErrorType, Polygon
from dataquality.utils.semantic_segmentation.polygons import draw_polygon
@@ -155,3 +156,54 @@ def calculate_undetected_polygons_batch(
pred_mask = pred_masks[idx].numpy()
gold_polygons = gold_polygons_batch[idx]
calculate_undetected_polygons(pred_mask, gold_polygons)


def calculate_dep_polygon(
dep_map: np.ndarray,
polygon_img: np.ndarray,
) -> float:
"""Calculate the mean dep score for one polygon drawn onto an image of all
zero's. We can then take the polygon's dep score by only selecting those pixels
with a value greater than 0 and averageing them.

Args:
dep_map (np.ndarray): heatmap of dep scores for an image
polygon_img (np.ndarray): image of all zeros with a polygon drawn on it

Returns:
dep_score (float): mean dep score for the polygon
"""
relevant_region = polygon_img != 0
dep_score = dep_map[relevant_region].mean()
return dep_score
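
A quick self-contained check of that masked mean, using hypothetical 3x3 arrays rather than real heatmaps:

    import numpy as np

    dep_map = np.array([[0.1, 0.2, 0.3],
                        [0.4, 0.5, 0.6],
                        [0.7, 0.8, 0.9]])
    polygon_img = np.array([[0, 1, 1],   # nonzero pixels mark the polygon's interior
                            [0, 1, 1],
                            [0, 0, 0]])
    print(dep_map[polygon_img != 0].mean())   # (0.2 + 0.3 + 0.5 + 0.6) / 4 = 0.4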


def calculate_dep_polygons_batch(
gold_polygons_batch: List[List[Polygon]],
dep_heatmaps: np.ndarray,
height: List[int],
width: List[int],
) -> None:
"""Takes the mean dep score within a polygon and sets the polygon's
dep score to the mean dep score

Args:
gold_polygons_batch (List[List[[Polygon]]): list of the gold polygons
for an image
dep_heatmaps (np.ndarray): heatmaps of DEP scores for an image
height (int): height of original image to resize the dep map to the correct
dims
width (int): width of original image to resize the dep map to the correct
dims
"""
resized_dep_maps = []
for i, dep_map in enumerate(dep_heatmaps):
resized_image = Image.fromarray(dep_map).resize((width[i], height[i]))
resized_dep_maps.append(np.array(resized_image))

for idx in range(len(resized_dep_maps)):
dep_map = resized_dep_maps[idx]
gold_polygons = gold_polygons_batch[idx]
for polygon in gold_polygons:
polygon_img = draw_polygon(polygon, dep_map.shape)
polygon.data_error_potential = calculate_dep_polygon(dep_map, polygon_img)
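
One detail worth noting in the resize step above: PIL's Image.resize takes a (width, height) tuple, while the numpy array that comes back is indexed (height, width). A minimal sketch under assumed dimensions:

    import numpy as np
    from PIL import Image

    dep_map = np.random.rand(64, 64).astype(np.float32)           # hypothetical low-res dep heatmap
    h, w = 480, 640                                               # assumed original image dims
    resized = np.array(Image.fromarray(dep_map).resize((w, h)))   # PIL expects (width, height)
    assert resized.shape == (h, w)                                # numpy shape is (height, width)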
6 changes: 3 additions & 3 deletions dataquality/utils/semantic_segmentation/metrics.py
@@ -19,16 +19,16 @@ def calculate_and_upload_dep(
gold_masks: torch.Tensor,
image_ids: List[int],
obj_prefix: str,
) -> List[float]:
) -> Tuple[List[float], torch.Tensor]:
"""Calculates the Data Error Potential (DEP) for each image in the batch

Uploads the heatmap to Minio as a png.
Returns the image DEP for each image in the batch.
Returns the image DEP for each image in the batch, as well as the dep heatmaps.
Image dep is calculated by the average pixel dep.
"""
dep_heatmaps = calculate_dep_heatmaps(probs, gold_masks)
upload_dep_heatmaps(dep_heatmaps, image_ids, obj_prefix)
return calculate_image_dep(dep_heatmaps)
return calculate_image_dep(dep_heatmaps), dep_heatmaps


def calculate_dep_heatmaps(
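
Since image dep is defined here as the average pixel dep, callers of the changed function now unpack two values, as _get_data_dict above does with image_dep, dep_heatmaps = calculate_and_upload_dep(...). A hedged sketch of what that second return value feeds into (calculate_image_dep's real body lives further down in metrics.py and may differ):

    from typing import List

    import torch

    def calculate_image_dep(dep_heatmaps: torch.Tensor) -> List[float]:
        # assumed shape (batch, H, W): one mean-pixel-dep score per image
        return dep_heatmaps.mean(dim=(1, 2)).tolist()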