Skip to content

Commit

Permalink
Export geth metrics on VM testnet (#1351)
Browse files Browse the repository at this point in the history
  • Loading branch information
tkporter authored and celo-ci-bot-user committed Nov 11, 2019
1 parent 25b62b5 commit ba60964
Show file tree
Hide file tree
Showing 20 changed files with 314 additions and 23 deletions.
5 changes: 5 additions & 0 deletions packages/celotool/src/cmds/deploy/destroy/vm-testnet.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { switchToClusterFromEnv } from 'src/lib/cluster'
import { removeHelmRelease } from 'src/lib/prom-to-sd-utils'
import { destroy } from 'src/lib/vm-testnet-utils'
import { DestroyArgv } from '../../deploy/destroy'

Expand All @@ -6,5 +8,8 @@ export const describe = 'destroy an existing VM-based testnet'
export const builder = {}

/**
 * Destroys the VM-based testnet for `argv.celoEnv`, then removes the helm
 * release that runs its prometheus to stackdriver statefulset.
 */
export const handler = async (argv: DestroyArgv) => {
  const { celoEnv } = argv

  await switchToClusterFromEnv()
  await destroy(celoEnv)

  // the prometheus to stackdriver statefulset is torn down last
  await removeHelmRelease(celoEnv)
}
11 changes: 10 additions & 1 deletion packages/celotool/src/cmds/deploy/initial/vm-testnet.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
import { deploy } from '../../../lib/vm-testnet-utils'
import { createClusterIfNotExists, setupCluster, switchToClusterFromEnv } from 'src/lib/cluster'
import { installHelmChart } from 'src/lib/prom-to-sd-utils'
import { deploy } from 'src/lib/vm-testnet-utils'
import { InitialArgv } from '../../deploy/initial'

// Sub-command name, as used under `deploy initial`.
export const command = 'vm-testnet'
// Help text for the command. This file is the *initial* deploy, so the text
// must not describe it as an upgrade (that is the job of deploy/upgrade/vm-testnet).
export const describe = 'deploy a testnet on a VM'
// This command takes no extra options.
export const builder = {}

/**
 * Performs the initial deployment of a VM testnet for `argv.celoEnv`:
 * prepares the Kubernetes cluster, deploys the VMs, then installs the
 * prometheus to stackdriver helm chart.
 */
export const handler = async (argv: InitialArgv) => {
  const { celoEnv } = argv

  // Kubernetes cluster that will host the prometheus to stackdriver statefulset
  const createdCluster = await createClusterIfNotExists()
  await switchToClusterFromEnv()
  await setupCluster(celoEnv, createdCluster)

  // VM testnet itself, deployed with Terraform
  await deploy(celoEnv)

  // prom to sd statefulset; installed after the testnet because its helm
  // parameters are built from the testnet's internal IP outputs
  await installHelmChart(celoEnv)
}
6 changes: 6 additions & 0 deletions packages/celotool/src/cmds/deploy/upgrade/vm-testnet.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { switchToClusterFromEnv } from 'src/lib/cluster'
import { upgradeHelmChart } from 'src/lib/prom-to-sd-utils'
import { deploy, taintTestnet, untaintTestnet } from 'src/lib/vm-testnet-utils'
import yargs from 'yargs'
import { UpgradeArgv } from '../../deploy/upgrade'
Expand All @@ -18,10 +20,14 @@ export const builder = (argv: yargs.Argv) => {
}

/**
 * Upgrades the VM testnet for `argv.celoEnv` and then upgrades its
 * prometheus to stackdriver helm release.
 *
 * With `argv.reset` set, the testnet is tainted before deploying, and the
 * failure callback handed to `deploy` untaints it again.
 */
export const handler = async (argv: VmTestnetArgv) => {
  const { celoEnv, reset } = argv

  await switchToClusterFromEnv()

  // Without a reset there is nothing to undo when the deploy fails
  const onDeployFailed = reset ? () => untaintTestnet(celoEnv) : () => Promise.resolve()
  if (reset) {
    await taintTestnet(celoEnv)
  }

  await deploy(celoEnv, onDeployFailed)

  // upgrade prom to sd statefulset
  await upgradeHelmChart(celoEnv)
}
4 changes: 4 additions & 0 deletions packages/celotool/src/lib/env-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ export enum envVar {
GETH_ACCOUNT_SECRET = 'GETH_ACCOUNT_SECRET',
GETH_BOOTNODE_DOCKER_IMAGE_REPOSITORY = 'GETH_BOOTNODE_DOCKER_IMAGE_REPOSITORY',
GETH_BOOTNODE_DOCKER_IMAGE_TAG = 'GETH_BOOTNODE_DOCKER_IMAGE_TAG',
GETH_EXPORTER_DOCKER_IMAGE_REPOSITORY = 'GETH_EXPORTER_DOCKER_IMAGE_REPOSITORY',
GETH_EXPORTER_DOCKER_IMAGE_TAG = 'GETH_EXPORTER_DOCKER_IMAGE_TAG',
GETH_NODES_BACKUP_CRONJOB_ENABLED = 'GETH_NODES_BACKUP_CRONJOB_ENABLED',
GETH_NODE_DOCKER_IMAGE_REPOSITORY = 'GETH_NODE_DOCKER_IMAGE_REPOSITORY',
GETH_NODE_DOCKER_IMAGE_TAG = 'GETH_NODE_DOCKER_IMAGE_TAG',
Expand All @@ -55,6 +57,8 @@ export enum envVar {
NEXMO_KEY = 'NEXMO_KEY',
NEXMO_SECRET = 'NEXMO_SECRET',
NOTIFICATION_SERVICE_FIREBASE_DB = 'NOTIFICATION_SERVICE_FIREBASE_DB',
PROMTOSD_EXPORT_INTERVAL = 'PROMTOSD_EXPORT_INTERVAL',
PROMTOSD_SCRAPE_INTERVAL = 'PROMTOSD_SCRAPE_INTERVAL',
SMS_RETRIEVER_HASH_CODE = 'SMS_RETRIEVER_HASH_CODE',
STACKDRIVER_MONITORING_DASHBOARD = 'STACKDRIVER_MONITORING_DASHBOARD',
STACKDRIVER_NOTIFICATION_APPLICATIONS_PREFIX = 'STACKDRIVER_NOTIFICATION_APPLICATIONS_PREFIX',
Expand Down
85 changes: 85 additions & 0 deletions packages/celotool/src/lib/prom-to-sd-utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import sleep from 'sleep-promise'
import { envVar, fetchEnv } from 'src/lib/env-utils'
import { installGenericHelmChart, removeGenericHelmChart } from 'src/lib/helm_deploy'
import { getStatefulSetReplicas, scaleResource } from 'src/lib/kubernetes'
import { execCmdWithExitOnFailure } from 'src/lib/utils'
import { getInternalTxNodeIPs, getInternalValidatorIPs } from 'src/lib/vm-testnet-utils'

const helmChartPath = '../helm-charts/prometheus-to-sd'

// This deploys a helm chart to Kubernetes that exports prometheus metrics from
// VM testnets to Stackdriver

/**
 * Installs the prometheus-to-sd chart for `celoEnv` as a new helm release.
 *
 * @param celoEnv environment name; also passed through to the generic helm installer
 * @returns whatever `installGenericHelmChart` resolves to
 */
export async function installHelmChart(celoEnv: string) {
  const parameters = await helmParameters(celoEnv)
  return installGenericHelmChart(celoEnv, releaseName(celoEnv), helmChartPath, parameters)
}

/**
 * Removes the prometheus-to-sd helm release belonging to `celoEnv`.
 */
export async function removeHelmRelease(celoEnv: string) {
  const release = releaseName(celoEnv)
  await removeGenericHelmChart(release)
}

/**
 * Upgrades the prometheus-to-sd helm release for `celoEnv`.
 *
 * The StatefulSet is scaled to 0 before `helm upgrade` runs and scaled back
 * to the replica count it had beforehand once the upgrade command returns.
 * When CELOTOOL_VERBOSE is 'true', a `--debug --dry-run` upgrade is executed
 * first.
 */
export async function upgradeHelmChart(celoEnv: string) {
  const release = releaseName(celoEnv)
  console.info(`Upgrading helm release ${release}`)

  const statefulSetName = `${celoEnv}-prom-to-sd`
  // Remember the current size so it can be restored after the upgrade
  const previousReplicas = await getStatefulSetReplicas(celoEnv, statefulSetName)

  console.info('Scaling StatefulSet down to 0...')
  await scaleResource(celoEnv, 'statefulset', statefulSetName, 0)
  // Fixed pause after requesting the scale-down
  await sleep(5000)

  const setFlags = (await helmParameters(celoEnv)).join(' ')
  const upgradeCmdArgs = `${release} ${helmChartPath} --namespace ${celoEnv} ${setFlags}`

  const verbose = process.env.CELOTOOL_VERBOSE === 'true'
  if (verbose) {
    await execCmdWithExitOnFailure(`helm upgrade --debug --dry-run ${upgradeCmdArgs}`)
  }
  await execCmdWithExitOnFailure(`helm upgrade ${upgradeCmdArgs}`)
  console.info(`Helm release ${release} upgrade successful`)

  // NOTE(review): this restores the pre-upgrade replica count. If the
  // validator / tx-node counts changed, the chart's replicaCount differs from
  // this value — confirm that restoring the old count is intended.
  console.info(`Scaling StatefulSet back up to ${previousReplicas}...`)
  await scaleResource(celoEnv, 'statefulset', statefulSetName, previousReplicas)
}

/**
 * Builds the `--set` flags passed to helm for the prometheus-to-sd chart.
 *
 * Metrics sources and pod ids are emitted as comma-separated lists in the
 * same order (validators first, then tx-nodes). The chart's container command
 * selects one entry per replica by its ordinal, which is why replicaCount is
 * the total number of nodes.
 */
async function helmParameters(celoEnv: string) {
  // The metrics endpoints are only exposed internally
  const validatorIps = await getInternalValidatorIPs(celoEnv)
  const validatorCount = parseInt(fetchEnv(envVar.VALIDATORS), 10)
  const validatorPodIds = Array.from(
    { length: validatorCount },
    (_, i) => `${celoEnv}-validator-${i}`
  )

  const txNodeIps = await getInternalTxNodeIPs(celoEnv)
  const txNodeCount = parseInt(fetchEnv(envVar.TX_NODES), 10)
  const txNodePodIds = Array.from({ length: txNodeCount }, (_, i) => `${celoEnv}-tx-node-${i}`)

  const metricsUrls = validatorIps
    .concat(txNodeIps)
    .map((ip: string) => `http://${ip}:9200/metrics`)
  const podIds = validatorPodIds.concat(txNodePodIds)

  // Commas inside a --set value are backslash-escaped so helm keeps each list
  // as a single string value
  const metricsSourcesFlag = `--set metricsSources.geth="${metricsUrls.join('\\,')}"`
  const scrapeIntervalFlag = `--set promtosd.scrape_interval=${fetchEnv(
    envVar.PROMTOSD_SCRAPE_INTERVAL
  )}`
  const exportIntervalFlag = `--set promtosd.export_interval=${fetchEnv(
    envVar.PROMTOSD_EXPORT_INTERVAL
  )}`
  const podIdsFlag = `--set promtosd.podIds="${podIds.join('\\,')}"`
  const namespaceFlag = `--set promtosd.namespaceId=${celoEnv}`
  const replicaCountFlag = `--set replicaCount=${validatorCount + txNodeCount}`

  return [
    metricsSourcesFlag,
    scrapeIntervalFlag,
    exportIntervalFlag,
    podIdsFlag,
    namespaceFlag,
    replicaCountFlag,
  ]
}

/**
 * Name of the prometheus-to-sd helm release for an environment:
 * the environment name followed by `-prom-to-sd`.
 */
function releaseName(celoEnv: string) {
  const suffix = '-prom-to-sd'
  return celoEnv + suffix
}
12 changes: 12 additions & 0 deletions packages/celotool/src/lib/vm-testnet-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ const testnetEnvVars: TerraformVars = {
geth_verbosity: envVar.GETH_VERBOSITY,
geth_bootnode_docker_image_repository: envVar.GETH_BOOTNODE_DOCKER_IMAGE_REPOSITORY,
geth_bootnode_docker_image_tag: envVar.GETH_BOOTNODE_DOCKER_IMAGE_TAG,
geth_exporter_docker_image_repository: envVar.GETH_EXPORTER_DOCKER_IMAGE_REPOSITORY,
geth_exporter_docker_image_tag: envVar.GETH_EXPORTER_DOCKER_IMAGE_TAG,
geth_node_docker_image_repository: envVar.GETH_NODE_DOCKER_IMAGE_REPOSITORY,
geth_node_docker_image_tag: envVar.GETH_NODE_DOCKER_IMAGE_TAG,
in_memory_discovery_table: envVar.IN_MEMORY_DISCOVERY_TABLE,
Expand Down Expand Up @@ -297,6 +299,16 @@ export async function getTxNodeLoadBalancerIP(celoEnv: string) {
return outputs.tx_node_lb_ip_address.value
}

/**
 * Reads the `validator_internal_ip_addresses` value from the testnet outputs
 * of `celoEnv`.
 */
export async function getInternalValidatorIPs(celoEnv: string) {
  const { validator_internal_ip_addresses: validatorIps } = await getTestnetOutputs(celoEnv)
  return validatorIps.value
}

/**
 * Reads the `tx_node_internal_ip_addresses` value from the testnet outputs
 * of `celoEnv`.
 */
export async function getInternalTxNodeIPs(celoEnv: string) {
  const { tx_node_internal_ip_addresses: txNodeIps } = await getTestnetOutputs(celoEnv)
  return txNodeIps.value
}

function getTerraformBackendConfigVars(celoEnv: string, terraformModule: string) {
return {
prefix: `${celoEnv}/${terraformModule}`,
Expand Down
51 changes: 42 additions & 9 deletions packages/helm-charts/prometheus-to-sd/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
apiVersion: apps/v1beta1
kind: Deployment
kind: StatefulSet
metadata:
name: {{ template "prometheus-to-sd.fullname" . }}
labels:
Expand All @@ -9,6 +9,7 @@ metadata:
heritage: {{ .Release.Service }}
spec:
replicas: {{ .Values.replicaCount }}
serviceName: {{ template "prometheus-to-sd.fullname" . }}
template:
metadata:
labels:
Expand All @@ -23,16 +24,48 @@ spec:
- name: profiler
containerPort: {{ .Values.port }}
command:
- /monitor
- --stackdriver-prefix=custom.googleapis.com
{{- range $key, $value := .Values.metricsSources }}
- --source={{ $key }}:{{ $value }}
{{- end }}
- --scrape-interval={{ .Values.promtosd.scrape_interval }}
- --export-interval={{ .Values.promtosd.export_interval }}
- /bin/sh
- "-c"
- |-
INDEX=${POD_NAME##*-}
NAMESPACE_ID="{{ .Values.promtosd.namespaceId }}"
NAMESPACE_ID_FLAG=""
[ "$NAMESPACE_ID" ] && NAMESPACE_ID_FLAG="--namespace-id=$NAMESPACE_ID"
POD_ID=`echo -n {{ .Values.promtosd.podIds }} | cut -d ',' -f $((INDEX + 1))`
POD_ID_FLAG=""
[ "$POD_ID" ] && POD_ID_FLAG="--pod-id=$POD_ID"
/monitor \
--stackdriver-prefix=custom.googleapis.com \
{{- range $key, $value := .Values.metricsSources }}
--source={{ $key }}:$(echo -n "{{ $value }}" | cut -d ',' -f $((INDEX + 1))) \
{{- end }}
--scrape-interval={{ .Values.promtosd.scrape_interval }} \
--export-interval={{ .Values.promtosd.export_interval }} \
$POD_ID_FLAG \
$NAMESPACE_ID_FLAG
resources:
{{ toYaml .Values.resources | indent 12 }}
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
{{- if .Values.nodeSelector }}
nodeSelector:
{{ toYaml .Values.nodeSelector | indent 8 }}
{{- end }}
{{- end }}
---

apiVersion: v1
kind: Service
metadata:
name: {{ template "prometheus-to-sd.fullname" . }}
labels:
component: {{ template "prometheus-to-sd.fullname" . }}
spec:
clusterIP: None
selector:
app: {{ template "prometheus-to-sd.name" . }}
10 changes: 8 additions & 2 deletions packages/helm-charts/prometheus-to-sd/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@ image:
repository: gcr.io/google-containers/prometheus-to-sd
tag: v0.3.2
pullPolicy: IfNotPresent
resources: {}
resources:
requests:
memory: 50M
cpu: 5m
port: 6060
metricsSources: {}
nodeSelector: {}
nodeSelector: {}
promtosd:
podIds: ""
namespaceId: ""
32 changes: 32 additions & 0 deletions packages/terraform-modules/testnet/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ data "terraform_remote_state" "state" {
}
}

# Tag lists referenced by the firewall rules in this module as `target_tags`.
# Each list holds a single tag derived from the environment name.
# NOTE(review): presumably these match the network tags set on the bootnode
# and node instances by the sub-modules — confirm against those modules.
locals {
firewall_target_tags_bootnode = ["${var.celo_env}-bootnode"]
firewall_target_tags_node = ["${var.celo_env}-node"]
}

# Looks up the existing network named by var.network_name; the firewall rules
# in this module attach to it via data.google_compute_network.network.name.
data "google_compute_network" "network" {
name = var.network_name
}
Expand All @@ -28,6 +33,8 @@ resource "google_compute_firewall" "ssh_firewall" {
name = "${var.celo_env}-ssh-firewall"
network = data.google_compute_network.network.name

target_tags = concat(local.firewall_target_tags_bootnode, local.firewall_target_tags_node)

allow {
protocol = "tcp"
ports = ["22"]
Expand All @@ -38,6 +45,8 @@ resource "google_compute_firewall" "geth_firewall" {
name = "${var.celo_env}-geth-firewall"
network = data.google_compute_network.network.name

target_tags = local.firewall_target_tags_node

allow {
protocol = "tcp"
ports = ["30303"]
Expand All @@ -49,10 +58,27 @@ resource "google_compute_firewall" "geth_firewall" {
}
}

# Opens TCP port 9200 on instances carrying the node target tag, restricted to
# sources in 10.0.0.0/8. Port 9200 is the port celotool's prometheus-to-sd
# helm parameters scrape (http://<internal ip>:9200/metrics).
resource "google_compute_firewall" "geth_metrics_firewall" {
name = "${var.celo_env}-geth-metrics-firewall"
network = data.google_compute_network.network.name

target_tags = local.firewall_target_tags_node

# allow all IPs internal to the VPC
source_ranges = ["10.0.0.0/8"]

allow {
protocol = "tcp"
ports = ["9200"]
}
}

resource "google_compute_firewall" "rpc_firewall" {
name = "${var.celo_env}-rpc-firewall"
network = data.google_compute_network.network.name

target_tags = local.firewall_target_tags_node

allow {
protocol = "tcp"
ports = ["8545", "8546"]
Expand All @@ -63,6 +89,8 @@ resource "google_compute_firewall" "bootnode_firewall" {
name = "${var.celo_env}-bootnode-firewall"
network = data.google_compute_network.network.name

target_tags = local.firewall_target_tags_bootnode

allow {
protocol = "udp"
ports = ["30301"]
Expand Down Expand Up @@ -93,6 +121,8 @@ module "tx_node" {
gcloud_secrets_bucket = var.gcloud_secrets_bucket
gcloud_vm_service_account_email = var.gcloud_vm_service_account_email
genesis_content_base64 = var.genesis_content_base64
geth_exporter_docker_image_repository = var.geth_exporter_docker_image_repository
geth_exporter_docker_image_tag = var.geth_exporter_docker_image_tag
geth_node_docker_image_repository = var.geth_node_docker_image_repository
geth_node_docker_image_tag = var.geth_node_docker_image_tag
geth_verbosity = var.geth_verbosity
Expand Down Expand Up @@ -123,6 +153,8 @@ module "validator" {
gcloud_secrets_bucket = var.gcloud_secrets_bucket
gcloud_vm_service_account_email = var.gcloud_vm_service_account_email
genesis_content_base64 = var.genesis_content_base64
geth_exporter_docker_image_repository = var.geth_exporter_docker_image_repository
geth_exporter_docker_image_tag = var.geth_exporter_docker_image_tag
geth_node_docker_image_repository = var.geth_node_docker_image_repository
geth_node_docker_image_tag = var.geth_node_docker_image_tag
geth_verbosity = var.geth_verbosity
Expand Down
2 changes: 2 additions & 0 deletions packages/terraform-modules/testnet/modules/bootnode/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ resource "google_compute_instance" "bootnode" {
name = local.name_prefix
machine_type = "n1-standard-1"

tags = [local.name_prefix]

allow_stopping_for_update = true

boot_disk {
Expand Down
Loading

0 comments on commit ba60964

Please sign in to comment.