Feat/datadog aurora monitoring (#2230)
* Added annotations to get argo controller to send metrics to Datadog

* Initial commit of the script to set up DB monitoring for Aurora. Need some guidance on how we want to manage config.

* Starting in on the password

* Mostly finished now, but we'll need to test the full end-to-end process

* Also had to update datadog files for the argo workflow metrics, as we'll need to modify the Helm release

* Cleaned up some debugging items. We'll have to redo the method for connecting to the DBs; we're going to try to read from g3auto instead.

* Need to run final tests.

* Committing work to switch to master.

* Switching over to set up revproxy

* Finalizing the ArgoCD setup script.

* Fixed a few errors

* Finalized the kube-setup-aurora-monitoring script.

* Added logic for calling kube-setup-aurora-monitoring to kube-setup-datadog, so that it will get called even if we do a new setup.

* Update values.yaml
AidanHilt authored May 10, 2023
1 parent 3e9352b commit a3835ad
Showing 5 changed files with 219 additions and 1 deletion.
169 changes: 169 additions & 0 deletions gen3/bin/kube-setup-aurora-monitoring.sh
@@ -0,0 +1,169 @@
source "${GEN3_HOME}/gen3/lib/utils.sh"
gen3_load "gen3/gen3setup"

databaseArray=()
databaseFarmArray=()

# This function retrieves all the top-level entries in creds.json that have the db items we want,
# so we can reuse that information while creating schemas and the like.
get_all_dbs() {
databases=$(jq 'to_entries[] | select (.value.db_password) | .key' $(gen3_secrets_folder)/creds.json)

OLD_IFS=$IFS
IFS=$'\n' databaseArray=($databases)
IFS=$OLD_IFS
}
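# For illustration only -- a hypothetical creds.json entry that the jq filter above would match
# (any top-level key whose value carries a db_password). Note that without `jq -r` the selected
# keys keep their surrounding quotes, which is why create_schema_and_function strips them later:
#
#   "fence": {
#     "db_host": "mycommons-aurora.cluster-xxxx.us-east-1.rds.amazonaws.com",
#     "db_username": "fence_user",
#     "db_password": "xxxxxxxx",
#     "db_database": "fence"
#   }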

get_all_dbs_db_farm() {
databases=$(jq 'to_entries[] | .key' $(gen3_secrets_folder)/g3auto/dbfarm/servers.json)

OLD_IFS=$IFS
IFS=$'\n' databaseFarmArray=($databases)
IFS=$OLD_IFS
}
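# Similarly, g3auto/dbfarm/servers.json is keyed by short server names; a made-up entry might
# look like the following, and those short names are what `gen3 psql` understands:
#
#   "server1": {
#     "db_host": "mycommons-aurora.cluster-xxxx.us-east-1.rds.amazonaws.com",
#     "db_username": "postgres",
#     "db_password": "xxxxxxxx"
#   }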

create_new_datadog_user() {
# Generate a new password for the datadog user in psql
datadogPsqlPassword=$(random_alphanumeric)

# update creds.json
if [ ! -d "$(gen3_secrets_folder)/datadog" ]
then
mkdir "$(gen3_secrets_folder)/datadog"
fi

if [ ! -s "$(gen3_secrets_folder)/datadog/datadog_db_users.json" ]
then
echo "{}" > "$(gen3_secrets_folder)/datadog/datadog_db_users.json"
fi

output=$(jq --arg host "$1" --arg password "$datadogPsqlPassword" '.[$host].datadog_db_password=$password' "$(gen3_secrets_folder)/datadog/datadog_db_users.json")
echo "$output" > "$(gen3_secrets_folder)/datadog/datadog_db_users.json"

# Instead of grabbing username, password, and all that, and doing our connection, we'll just figure out
# which short name (i.e., server1, server2, etc.) corresponds to our host, and connect that way.
# Saves a few lines of code.
shortname=$(jq --arg host "$1" 'to_entries[] | select (.value.db_host == $host) | .key' $(gen3_secrets_folder)/g3auto/dbfarm/servers.json | tr -d '"')

# Create the Datadog user in the database
if gen3 psql $shortname -c "SELECT 1 FROM pg_roles WHERE rolname='datadog'" | grep -q 1;
then
gen3 psql $shortname -c "ALTER USER datadog WITH password '$datadogPsqlPassword';"
else
gen3 psql $shortname -c "CREATE USER datadog WITH password '$datadogPsqlPassword';"
fi

echo $datadogPsqlPassword
}

get_datadog_db_password() {
    # Look up the Datadog user's password for this host, creating the user (and password) if it doesn't exist yet
    datadogPsqlPassword="$(jq -r --arg host "$1" '.[$host].datadog_db_password // empty' < "$(gen3_secrets_folder)/datadog/datadog_db_users.json")"
if [[ -z "$datadogPsqlPassword" ]]
then
datadogPsqlPassword=$(create_new_datadog_user $1)
fi

echo $datadogPsqlPassword
}
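# After the two helpers above have run, datadog_db_users.json ends up keyed by host, roughly
# like this (host and password values are placeholders):
#
#   {
#     "mycommons-aurora.cluster-xxxx.us-east-1.rds.amazonaws.com": {
#       "datadog_db_password": "<random_alphanumeric output>"
#     }
#   }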

create_schema_and_function() {
svc=$(echo $1 | tr -d '"')
host=$(jq --arg service "$svc" '.[$service].db_host' $(gen3_secrets_folder)/creds.json | tr -d '"')
database=$(jq --arg service "$svc" '.[$service].db_database' $(gen3_secrets_folder)/creds.json | tr -d '"')

username=$(jq --arg host "$host" 'map(select(.db_host==$host))[0] | .db_username' $(gen3_secrets_folder)/g3auto/dbfarm/servers.json | tr -d '"')
password=$(jq --arg host "$host" 'map(select(.db_host==$host))[0] | .db_password' $(gen3_secrets_folder)/g3auto/dbfarm/servers.json | tr -d '"')

ddPass=$(get_datadog_db_password $host)

PGPASSWORD=$password psql -h "$host" -U "$username" -d "$database" -t <<SQL
CREATE SCHEMA datadog;
GRANT USAGE ON SCHEMA datadog TO datadog;
GRANT USAGE ON SCHEMA public TO datadog;
GRANT pg_monitor TO datadog;
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
SQL

PGPASSWORD=$password psql -h "$host" -U "$username" -d "$database" -t <<SQL
CREATE OR REPLACE FUNCTION datadog.explain_statement(
l_query TEXT,
OUT explain JSON
)
RETURNS SETOF JSON AS
\$\$
DECLARE
curs REFCURSOR;
plan JSON;
BEGIN
OPEN curs FOR EXECUTE pg_catalog.concat('EXPLAIN (FORMAT JSON) ', l_query);
FETCH curs INTO plan;
CLOSE curs;
RETURN QUERY SELECT plan;
END;
\$\$
LANGUAGE 'plpgsql'
RETURNS NULL ON NULL INPUT
SECURITY DEFINER;
SQL

gen3_log_info "Succesfully added the function and schema"
}
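# Optional sanity check: once the schema and function exist, the helper can be exercised from
# any psql session against that database, e.g.
#
#   SELECT datadog.explain_statement('SELECT 1');
#
# which is roughly what the Datadog agent's database monitoring runs to collect execution plans
# without needing broader privileges (the function is SECURITY DEFINER).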

if [ $# -eq 0 ]; then
echo "Error: No argument provided. You must provide the name of the Aurora cluster to operate against"
exit 1
fi

get_all_dbs databaseArray

# Loop through every database, creating the schema and function
for db in "${databaseArray[@]}"
do
create_schema_and_function $db
done


# Set up the agent
#==============================

# Get the instances in the Aurora cluster
# We'll take the name of the cluster as the first argument, so we won't need to go digging for that. Instead, we'll just
# pull out connection strings and ports for each instance

instances=$(aws rds describe-db-instances --filters "Name=db-cluster-id,Values=$1" --no-paginate | jq '.DBInstances[].Endpoint.Address,.DBInstances[].Endpoint.Port' | tr -d '"')
clusterEndpoint=$(aws rds describe-db-cluster-endpoints --db-cluster-identifier "$1" | jq ' .DBClusterEndpoints[0].Endpoint' | tr -d '"')

postgresString=""
for instance in "${instances[@]}"
do
instanceArray=($instance)
datadogUserPassword=$(jq --arg instance "$clusterEndpoint" '.[$instance].datadog_db_password' $(gen3_secrets_folder)/datadog/datadog_db_users.json | tr -d '"')
postgresString+=$(yq --arg url "${instanceArray[0]}" --yaml-output '.instances[0].host = $url' "$GEN3_HOME/kube/services/datadog/postgres.yaml" | yq --arg password "$datadogUserPassword" --yaml-output '.instances[0].password = $password')
done

confd=$(yq -n --yaml-output --arg postgres "$postgresString" '.clusterAgent.confd."postgres.yaml" = $postgres | .clusterChecksRunner.enabled = true')
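# At this point $confd should hold a Helm values fragment shaped roughly like the following
# (the endpoint and password are placeholders filled in by the loop above):
#
#   clusterAgent:
#     confd:
#       postgres.yaml: |
#         cluster_check: true
#         init_config:
#         instances:
#           - dbm: true
#             host: <instance endpoint>
#             port: 5432
#             username: datadog
#             password: <datadog user password>
#   clusterChecksRunner:
#     enabled: true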

# We'll need two ways to do this: one for commons where Datadog is managed by ArgoCD, and another for commons where
# it's installed directly.

if kubectl get applications.argoproj.io -n argocd datadog-application &> /dev/null
then
gen3_log_info "We detected an ArgoCD application named 'datadog-application,' so we're modifying that"

patch=$(yq -n --yaml-output --arg confd "$confd" '.spec.source.helm.values = $confd')

echo "$patch" > /tmp/confd.yaml

kubectl patch applications.argoproj.io datadog-application --type merge -n argocd --patch-file /tmp/confd.yaml

else
gen3_log_info "We didn't detect an ArgoCD application named 'datadog-application,' so we're going to reinstall the DD Helm chart"

(cat kube/services/datadog/values.yaml | yq --arg endpoints "$postgresString" --yaml-output '.clusterAgent.confd."postgres.yaml" = $endpoints | .clusterChecksRunner.enabled = true') > $(gen3_secrets_folder)/datadog/datadog_values.yaml
helm repo add datadog https://helm.datadoghq.com --force-update 2> >(grep -v 'This is insecure' >&2)
helm repo update 2> >(grep -v 'This is insecure' >&2)
helm upgrade --install datadog -f "$(gen3_secrets_folder)/datadog/datadog_values.yaml" datadog/datadog -n datadog --version 3.6.4 2> >(grep -v 'This is insecure' >&2)
fi
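# One way to spot-check the rollout (assuming the chart's default cluster-agent deployment name
# for a release called "datadog") is to list the dispatched cluster checks and look for the
# postgres instances:
#
#   kubectl -n datadog exec deploy/datadog-cluster-agent -- datadog-cluster-agent clusterchecks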
36 changes: 35 additions & 1 deletion gen3/bin/kube-setup-datadog.sh
@@ -26,7 +26,7 @@ if [[ "$ctxNamespace" == "default" || "$ctxNamespace" == "null" ]]; then
g3kubectl delete namespace datadog
g3kubectl create namespace datadog
fi
# create namespace if it doens't exist
# create namespace if it doesn't exist
if (! g3kubectl get namespace datadog > /dev/null 2>&1); then
gen3_log_info "Creating namespace datadog"
g3kubectl create namespace datadog
@@ -53,6 +53,40 @@ if [[ "$ctxNamespace" == "default" || "$ctxNamespace" == "null" ]]; then
else
helm upgrade --install datadog -f "$GEN3_HOME/kube/services/datadog/values.yaml" datadog/datadog -n datadog --version 3.6.4 2> >(grep -v 'This is insecure' >&2)
fi

# Check the manifest to see if we want to set up database monitoring. If so, let the operator
# pick an Aurora cluster and run kube-setup-aurora-monitoring against it.
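# A hypothetical manifest.json fragment that turns this path on would look something like:
#
#   "datadog": {
#     "db_monitoring_enabled": true
#   }
#
# (the block name follows the .datadog.db_monitoring_enabled path queried below).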

if g3k_manifest_lookup .datadog.db_monitoring_enabled &> /dev/null; then
gen3_log_info "Detected that this commons is using database monitoring. Setting that up now."
clusters=$(aws rds describe-db-clusters --query "DBClusters[].DBClusterIdentifier" --output text)
clusterArray=($clusters)

for i in "${!clusterArray[@]}"; do
echo "$((i+1)). ${clusterArray[i]}"
done

selected="false"
selection=""

until [ $selected == "true" ]
do
read -p "Enter the number of the cluster you want to monitor (1-${#clusterArray[@]}): " num
if [[ "$num" =~ ^[0-9]+$ ]] && ((num >= 1 && num <= ${#clusterArray[@]})); then
echo "You entered: $num"
selected="true"
selection=${clusterArray[$num - 1]}
else
echo "Invalid input: $num"
fi
done

gen3 kube-setup-aurora-monitoring "$selection"
else
gen3_log_info "No database monitoring detected. We're done here."
fi

)
else
gen3_log_info "kube-setup-datadog exiting - datadog already deployed, use --force to redeploy"
4 changes: 4 additions & 0 deletions kube/services/datadog/datadog_db_user.json
@@ -0,0 +1,4 @@
{
"datadog_db_user": "datadog",
"datadog_db_password": null
}
8 changes: 8 additions & 0 deletions kube/services/datadog/postgres.yaml
@@ -0,0 +1,8 @@
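# Template consumed by kube-setup-aurora-monitoring.sh: the script fills in
# instances[0].host and instances[0].password with yq and hands the result to the
# Datadog cluster agent as a confd cluster check.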
cluster_check: true
init_config:
instances:
- dbm: true
host:
port: 5432
username: datadog
password:
3 changes: 3 additions & 0 deletions kube/services/datadog/values.yaml
@@ -10,6 +10,9 @@ datadog:
useHostPort: true
nonLocalTraffic: true

#This is used to configure a lot of the checks that Datadog runs. Normally we would annotate a service, but since we
#use Aurora, we have to configure them through confd instead

#Enables Optional Universal Service Monitoring
## ref: https://docs.datadoghq.com/tracing/universal_service_monitoring/?tab=helm
serviceMonitoring:
