From 762314ed8793f8609bee4992baba01270dbb9067 Mon Sep 17 00:00:00 2001 From: Mike VanDenburgh Date: Wed, 13 Nov 2024 22:00:09 -0500 Subject: [PATCH] Optimize sites list query Moves the color_code subquery to a separate query to avoid a slow subquery in the main query, due to the EvaluationBroadAreaSearchDetection table being extremely large. The number of sites in an evaluation run is small enough that it's feasible to load the site_id->color_code mapping into memory and manually join it with the sites_list in Python. --- rdwatch/scoring/views/model_run.py | 119 +++++++++++++++-------------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/rdwatch/scoring/views/model_run.py b/rdwatch/scoring/views/model_run.py index 7f25fe5fd..9e1f0b946 100644 --- a/rdwatch/scoring/views/model_run.py +++ b/rdwatch/scoring/views/model_run.py @@ -697,6 +697,7 @@ def get_sites_query(model_run_id: UUID4): sites=JSONBAgg( JSONObject( id='pk', + site_id='site_id', number=Substr(F('site_id'), 9, 4), # pos is 1 indexed, bbox=BoundingBox( Func( @@ -719,65 +720,6 @@ def get_sites_query(model_run_id: UUID4): ), end_date=Coalesce(ExtractEpoch('end_date'), ExtractEpoch('point_date')), originator=F('originator'), - color_code=Case( - When( - Q(originator='te') | Q(originator='iMERIT'), - then=Subquery( - EvaluationBroadAreaSearchDetection.objects.filter( - Q(evaluation_run_uuid=model_run_id) - & Q(activity_type='overall') - & Q(rho=0.5) - & Q(tau=0.2) - & Q(min_confidence_score=0.0) - & Q(site_truth=OuterRef('site_id')) - & Q( - Q(min_spatial_distance_threshold__isnull=True) - | Q(min_spatial_distance_threshold=100.0) - ) - & Q( - Q(central_spatial_distance_threshold__isnull=True) - | Q(central_spatial_distance_threshold=500.0) - ) - & Q(max_spatial_distance_threshold__isnull=True) - & Q( - Q(min_temporal_distance_threshold__isnull=True) - | Q(min_temporal_distance_threshold=730.0) - ) - & Q(central_temporal_distance_threshold__isnull=True) - & Q(max_temporal_distance_threshold__isnull=True) - ).values('color_code') - ), - ), - When( - ~Q(originator='te') & ~Q(originator='iMERIT'), - then=Subquery( - EvaluationBroadAreaSearchProposal.objects.filter( - Q(evaluation_run_uuid=model_run_id) - & Q(activity_type='overall') - & Q(rho=0.5) - & Q(tau=0.2) - & Q(min_confidence_score=0.0) - & Q(site_proposal=OuterRef('site_id')) - & Q( - Q(min_spatial_distance_threshold__isnull=True) - | Q(min_spatial_distance_threshold=100.0) - ) - & Q( - Q(central_spatial_distance_threshold__isnull=True) - | Q(central_spatial_distance_threshold=500.0) - ) - & Q(max_spatial_distance_threshold__isnull=True) - & Q( - Q(min_temporal_distance_threshold__isnull=True) - | Q(min_temporal_distance_threshold=730.0) - ) - & Q(central_temporal_distance_threshold__isnull=True) - & Q(max_temporal_distance_threshold__isnull=True) - ).values('color_code') - ), - ), - default=Value(None), # Set an appropriate default value here - ), ), ordering='site_id', default=[], @@ -804,6 +746,62 @@ def get_sites_query(model_run_id: UUID4): image_info = {i['site']: i for i in image_queryset} + # Build a mapping of site_id to color code. + # We do this in a separate query to avoid a slow subquery in the main query, + # due to the EvaluationBroadAreaSearchDetection table being extremely large. + # The number of sites in an evaluation run is small enough that it's feasible + # to load the site_id->color_code mapping into memory and manually join it + # with the sites_list in Python. + site_to_color_code_mapping: dict[str, int] = { + obj['site_truth']: obj['color_code'] + for obj in EvaluationBroadAreaSearchDetection.objects.filter( + Q(evaluation_run_uuid=model_run_id) + & Q(activity_type='overall') + & Q(rho=0.5) + & Q(tau=0.2) + & Q(min_confidence_score=0.0) + & Q( + Q(min_spatial_distance_threshold__isnull=True) + | Q(min_spatial_distance_threshold=100.0) + ) + & Q( + Q(central_spatial_distance_threshold__isnull=True) + | Q(central_spatial_distance_threshold=500.0) + ) + & Q(max_spatial_distance_threshold__isnull=True) + & Q( + Q(min_temporal_distance_threshold__isnull=True) + | Q(min_temporal_distance_threshold=730.0) + ) + & Q(central_temporal_distance_threshold__isnull=True) + & Q(max_temporal_distance_threshold__isnull=True) + ).values('site_truth', 'color_code') + } | { + obj['site_proposal']: obj['color_code'] + for obj in EvaluationBroadAreaSearchProposal.objects.filter( + Q(evaluation_run_uuid=model_run_id) + & Q(activity_type='overall') + & Q(rho=0.5) + & Q(tau=0.2) + & Q(min_confidence_score=0.0) + & Q( + Q(min_spatial_distance_threshold__isnull=True) + | Q(min_spatial_distance_threshold=100.0) + ) + & Q( + Q(central_spatial_distance_threshold__isnull=True) + | Q(central_spatial_distance_threshold=500.0) + ) + & Q(max_spatial_distance_threshold__isnull=True) + & Q( + Q(min_temporal_distance_threshold__isnull=True) + | Q(min_temporal_distance_threshold=730.0) + ) + & Q(central_temporal_distance_threshold__isnull=True) + & Q(max_temporal_distance_threshold__isnull=True) + ).values('site_proposal', 'color_code') + } + for s in site_list['sites']: if s['id'] in image_info.keys(): site_image_info = image_info[s['id']] @@ -821,6 +819,9 @@ def get_sites_query(model_run_id: UUID4): s['L8'] = site_image_info['L8'] s['PL'] = site_image_info['PL'] s['downloading'] = site_image_info['downloading'] + + s['color_code'] = site_to_color_code_mapping.get(s['site_id']) + return site_list