From fd116533c80e9e9bd1863184b9ad509dd3c5579d Mon Sep 17 00:00:00 2001 From: Chris Helma <25470211+chelma@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:54:09 -0600 Subject: [PATCH] Added initial CloudWatch Dashboard for RFS (#1147) * Added initial CloudWatch Dashboard for RFS Signed-off-by: Chris Helma * Fixed some linting issues in RFS CDK code Signed-off-by: Chris Helma * Minor updates per PR comments Signed-off-by: Chris Helma * Minor tweaks per team discussion Signed-off-by: Chris Helma * Added search for domain to RFS Dashboard Signed-off-by: Chris Helma * More updates per PR comments Signed-off-by: Chris Helma --------- Signed-off-by: Chris Helma --- .../reindex-from-snapshot-dashboard.json | 295 ++++++++++++++++++ .../reindex-from-snapshot-stack.ts | 46 ++- 2 files changed, 340 insertions(+), 1 deletion(-) create mode 100644 deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json diff --git a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json new file mode 100644 index 000000000..c498b9808 --- /dev/null +++ b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json @@ -0,0 +1,295 @@ +{ + "variables": [ + { + "type": "property", + "property": "region", + "inputType": "input", + "id": "REGION", + "label": "Region", + "defaultValue": "us-east-1", + "visible": false + }, + { + "type": "property", + "property": "DomainName", + "inputType": "select", + "id": "TC_DOMAIN_NAME", + "label": "Target Cluster Domain Name", + "search": "{AWS/ES,ClientId,DomainName} MetricName=\"CPUUtilization\"", + "populateFrom": "DomainName", + "defaultValue": "placeholder-name", + "visible": true + }, + { + "type": "pattern", + "pattern": "MA_STAGE", + "inputType": "input", + "id": "MA_STAGE", + "label": "Migration Assistant Stage", + "defaultValue": 
"placeholder-stage", + "visible": false + }, + { + "type": "pattern", + "pattern": "ACCOUNT_ID", + "inputType": "input", + "id": "ACCOUNT_ID", + "label": "Account ID", + "defaultValue": "ACCOUNT_ID", + "visible": false + } + ], + "widgets": [ + { + "height": 1, + "width": 24, + "y": 0, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Target Cluster\n", + "background": "transparent" + } + }, + { + "height": 8, + "width": 12, + "y": 1, + "x": 0, + "type": "metric", + "properties": { + "view": "timeSeries", + "stacked": false, + "metrics": [ + [ { "expression": "METRICS()/1000/PERIOD(m1)*60", "id": "e1", "region": "REGION" } ], + [ "AWS/ES", "IndexingRate", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "region": "region", "label": "Document Ingested (included replicas) - MIN: ${MIN}, MAX: ${MAX}, AVG: ${AVG}", "id": "m1", "visible": false } ] + ], + "region": "REGION", + "title": "Target Cluster Document Index Rate", + "yAxis": { + "left": { + "label": "Thousands", + "showUnits": false + } + }, + "period": 60, + "stat": "Sum" + } + }, + { + "height": 8, + "width": 12, + "y": 1, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ { "expression": "METRICS()/1000", "id": "e1", "region": "REGION" } ], + [ "AWS/ES", "SearchableDocuments", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "region": "REGION", "label": "SearchableDocuments - MIN: ${MIN}, MAX ${MAX}", "id": "m1", "visible": false } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "title": "Target Cluster SearchableDocuments", + "period": 60, + "stat": "Average", + "yAxis": { + "left": { + "label": "Thousands", + "showUnits": false + } + } + } + }, + { + "height": 8, + "width": 12, + "y": 9, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ES", "4xx", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "region": "REGION", "label": "4xx - ${SUM}" } ], + [ ".", "3xx", ".", ".", ".", ".", { "region": 
"REGION", "label": "3xx - ${SUM}" } ], + [ ".", "2xx", ".", ".", ".", ".", { "region": "REGION", "label": "2xx - ${SUM}" } ], + [ ".", "5xx", ".", ".", ".", ".", { "region": "REGION", "label": "5xx - ${SUM}" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "stat": "Sum", + "period": 300, + "title": "Target Cluster Status Codes (per 5 minutes)" + } + }, + { + "height": 8, + "width": 12, + "y": 9, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ES", "CPUUtilization", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "stat": "Minimum", "label": "Min Data Node CPU Utilization", "color": "#2ca02c", "region": "REGION" } ], + [ "...", { "stat": "Maximum", "label": "Max Data Node CPU Utilization", "color": "#d62728", "region": "REGION" } ], + [ "...", { "stat": "Average", "label": "Avg Data Node CPU Utilization", "color": "#1f77b4", "region": "REGION" } ], + [ "AWS/ES", "MasterCPUUtilization", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "stat": "Minimum", "label": "Min Master Node CPU Utilization", "color": "#98df8a", "region": "REGION" } ], + [ "...", { "stat": "Maximum", "label": "Max Master Node CPU Utilization", "color": "#ff9896", "region": "REGION" } ], + [ "...", { "stat": "Average", "label": "Avg Master Node CPU Utilization", "color": "#ff7f0e", "region": "REGION" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "title": "Target Cluster CPU Utilization by Node", + "period": 60, + "yAxis": { + "left": { + "label": "CPU Utilization (%)", + "min": 0, + "max": 100, + "showUnits": false + } + }, + "legend": { + "position": "bottom" + } + } + }, + { + "height": 8, + "width": 12, + "y": 17, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ { "expression": "METRICS()/1000", "label": "", "id": "e1", "region": "REGION" } ], + [ "AWS/ES", "ClusterUsedSpace", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "id": "m1", "visible": false, 
"period": 60, "region": "REGION" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "title": "Target Cluster Used Space", + "period": 60, + "stat": "Average", + "yAxis": { + "left": { + "label": "GB", + "showUnits": false + } + } + } + }, + { + "height": 8, + "width": 12, + "y": 17, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ES", "ThroughputThrottle", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "id": "m1", "period": 60, "region": "REGION" } ], + [ ".", "IopsThrottle", ".", ".", ".", ".", { "period": 60, "region": "REGION", "id": "m2" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "title": "Target Cluster EBS Throttling", + "period": 60, + "stat": "Average" + } + }, + { + "height": 1, + "width": 24, + "y": 25, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Reindex-From-Snapshot Workers", + "background": "transparent" + } + }, + { + "height": 8, + "width": 12, + "y": 26, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "OpenSearchMigrations", "bytesSent", "OTelLib", "documentMigration", { "region": "REGION", "label": "Bytes Sent - MIN - ${MIN}, MAX - ${MAX}, AVG - ${AVG}" } ] + ], + "period": 60, + "region": "REGION", + "stacked": false, + "title": "RFS Reindexing Traffic", + "view": "timeSeries", + "stat": "Sum", + "yAxis": { + "left": { + "label": "Bytes", + "showUnits": false + } + } + } + }, + { + "height": 8, + "width": 12, + "y": 26, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ { "expression": "METRICS()/PERIOD(m1)*60", "id" : "e1", "region": "REGION" } ], + [ "AWS/ECS", "CPUUtilization", "ServiceName", "migration-MA_STAGE-reindex-from-snapshot", "ClusterName", "migration-MA_STAGE-ecs-cluster", { "region": "REGION", "label": "RFS Workers - MIN - ${MIN}, MAX - ${MAX}, AVG - ${AVG}", "id": "m1", "visible": false } ] + ], + "period": 60, + "region": "REGION", + "stacked": false, + "title": "RFS Workers 
Reporting In", + "view": "timeSeries", + "stat": "SampleCount" + } + }, + { + "height": 8, + "width": 12, + "y": 34, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ECS", "CPUUtilization", "ServiceName", "migration-MA_STAGE-reindex-from-snapshot", "ClusterName", "migration-MA_STAGE-ecs-cluster", { "stat": "Minimum", "region": "REGION", "color": "#2ca02c" } ], + [ "...", { "stat": "Average", "region": "REGION", "color": "#1f77b4" } ], + [ "...", { "stat": "Maximum", "region": "REGION", "color": "#d62728" } ] + ], + "period": 60, + "region": "REGION", + "stacked": false, + "title": "RFS CPU utilization", + "view": "timeSeries" + } + }, + { + "height": 8, + "width": 12, + "y": 34, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ECS", "MemoryUtilization", "ServiceName", "migration-MA_STAGE-reindex-from-snapshot", "ClusterName", "migration-MA_STAGE-ecs-cluster", { "stat": "Minimum", "region": "REGION", "color": "#2ca02c" } ], + [ "...", { "stat": "Average", "region": "REGION", "color": "#1f77b4" } ], + [ "...", { "stat": "Maximum", "region": "REGION", "color": "#d62728" } ] + ], + "period": 60, + "region": "REGION", + "stacked": false, + "title": "RFS Memory utilization", + "view": "timeSeries" + } + } + ] +} \ No newline at end of file diff --git a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts index ff69e4683..38d6a9b01 100644 --- a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts +++ b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts @@ -21,7 +21,43 @@ import { import { RFSBackfillYaml, SnapshotYaml } from "../migration-services-yaml"; import { OtelCollectorSidecar } from "./migration-otel-collector-sidecar"; import { SharedLogFileSystem } from "../components/shared-log-file-system"; - 
import { CfnDashboard } from "aws-cdk-lib/aws-cloudwatch";
import * as rfsDashboard from '../components/reindex-from-snapshot-dashboard.json';


/**
 * A dashboard variable entry. Only the two fields these helpers read or write are modelled;
 * any other fields present on the object are carried through untouched.
 */
interface DashboardVariable {
    id: string;
    defaultValue: string;
}

/**
 * The part of a dashboard body these helpers work with. Any other top-level fields on the
 * object (for example the widget list) are carried through untouched.
 */
interface DashboardBody {
    variables: DashboardVariable[];
}

/**
 * Returns a copy of `variables` in which the first entry whose `id` equals `variableName`
 * has its `defaultValue` replaced.
 *
 * Neither the input array nor its entries are modified. If no entry matches, the result is
 * an unchanged copy of the list (no error is raised).
 *
 * @param variables - the dashboard's variable definitions
 * @param variableName - the `id` of the variable to update
 * @param defaultValue - the new default value for that variable
 * @returns a new array; only the matching entry is a new object
 */
function setDefaultValueForVariable(variables: DashboardVariable[], variableName: string, defaultValue: string): DashboardVariable[] {
    // Only the first match is updated, even if several entries share the same id.
    const matchIndex = variables.findIndex((variable) => variable.id === variableName);
    return variables.map((variable, index) =>
        index === matchIndex ? { ...variable, defaultValue } : variable
    );
}

/**
 * Returns a shallow copy of `dashboardBody` whose variable `variableId` defaults to `value`.
 * The input is left untouched, so an imported JSON template can be passed in directly.
 */
function withVariableDefault(dashboardBody: DashboardBody, variableId: string, value: string): DashboardBody {
    return {
        ...dashboardBody,
        variables: setDefaultValueForVariable(dashboardBody.variables, variableId, value),
    };
}

/**
 * Returns a copy of the dashboard body with the `ACCOUNT_ID` variable defaulting to `account`.
 */
function setAccountIdForDashboard(dashboardBody: DashboardBody, account: string): DashboardBody {
    return withVariableDefault(dashboardBody, 'ACCOUNT_ID', account);
}

/**
 * Returns a copy of the dashboard body with the `REGION` variable defaulting to `region`.
 */
function setRegionForDashboard(dashboardBody: DashboardBody, region: string): DashboardBody {
    return withVariableDefault(dashboardBody, 'REGION', region);
}

/**
 * Returns a copy of the dashboard body with the `MA_STAGE` variable defaulting to `stage`.
 */
function setStageForDashboard(dashboardBody: DashboardBody, stage: string): DashboardBody {
    return withVariableDefault(dashboardBody, 'MA_STAGE', stage);
}

/*
 * The rest of this patch fragment is kept below with its tokens unchanged (only line breaks
 * restored). It is diff context for, and a second hunk against, reindex-from-snapshot-stack.ts;
 * the interface and class it touches are only partially visible in this fragment.
 *
 *  export interface ReindexFromSnapshotProps extends StackPropsExt {
 *      readonly vpc: IVpc,
 * @@ -191,6 +227,14 @@ export class ReindexFromSnapshotStack extends MigrationServiceCore {
 *              ...props
 *          });
 *
 * +        let dashboard = setAccountIdForDashboard(rfsDashboard, this.account)
 * +        dashboard = setRegionForDashboard(dashboard, this.region)
 * +        dashboard = setStageForDashboard(dashboard, props.stage)
 * +        new CfnDashboard(this, 'RFSDashboard', {
 * +            dashboardName: `MigrationAssistant_ReindexFromSnapshot_${props.stage}_Dashboard`,
 * +            dashboardBody: JSON.stringify(dashboard)
 * +        });
 * +
 *          this.rfsBackfillYaml = new RFSBackfillYaml();
 *          this.rfsBackfillYaml.ecs.cluster_name = `migration-${props.stage}-ecs-cluster`;
 *          this.rfsBackfillYaml.ecs.service_name = `migration-${props.stage}-reindex-from-snapshot`;
 */