Add and parallelize target-status ClusterPodMonitoring e2e tests #641

Merged
merged 2 commits on Oct 20, 2023
doc/api.md (2 changes: 1 addition & 1 deletion)

@@ -137,7 +137,7 @@ ClusterPodMonitoringList is a list of ClusterPodMonitorings.

## ClusterPodMonitoringSpec

ClusterPodMonitoringSpec contains specification parameters for PodMonitoring.
ClusterPodMonitoringSpec contains specification parameters for ClusterPodMonitoring.


<em>appears in: [ClusterPodMonitoring](#clusterpodmonitoring)</em>

e2e/collector_test.go (320 changes: 108 additions & 212 deletions)

@@ -24,6 +24,7 @@ import (

gcm "cloud.google.com/go/monitoring/apiv3/v2"
gcmpb "cloud.google.com/go/monitoring/apiv3/v2/monitoringpb"
"github.com/GoogleCloudPlatform/prometheus-engine/e2e/kubeutil"
"github.com/google/go-cmp/cmp"
"google.golang.org/api/iterator"
"google.golang.org/protobuf/types/known/timestamppb"
@@ -50,9 +51,100 @@ func TestCollector(t *testing.T) {
// We could simply verify that the full collection chain works once. But validating
// more fine-grained stages makes debugging a lot easier.
t.Run("deployed", tctx.subtest(testCollectorDeployed))
t.Run("self-podmonitoring", tctx.subtest(testCollectorSelfPodMonitoring))
t.Run("self-clusterpodmonitoring", tctx.subtest(testCollectorSelfClusterPodMonitoring))
t.Run("target-status", tctx.subtest(testCollectorTargetStatus))

t.Run("scrape", tctx.subtest(func(ctx context.Context, t *OperatorContext) {
t.createOperatorConfigFrom(ctx, monitoringv1.OperatorConfig{
Features: monitoringv1.OperatorFeatures{
TargetStatus: monitoringv1.TargetStatusSpec{
Enabled: true,
},
},
})

t.Run("self-podmonitoring", tctx.subtest(func(ctx context.Context, t *OperatorContext) {
t.Parallel()
testCollector(ctx, t, &monitoringv1.PodMonitoring{
ObjectMeta: metav1.ObjectMeta{
Name: "collector-podmon",
Namespace: t.namespace,
},
Spec: monitoringv1.PodMonitoringSpec{
Selector: metav1.LabelSelector{
MatchLabels: map[string]string{
operator.LabelAppName: operator.NameCollector,
testLabel: t.GetOperatorTestLabelValue(),
},
},
Endpoints: selfScrapeEndpointConfig(),
},
})
}))
t.Run("self-clusterpodmonitoring", tctx.subtest(func(ctx context.Context, t *OperatorContext) {
t.Parallel()
testCollector(ctx, t, &monitoringv1.ClusterPodMonitoring{
ObjectMeta: metav1.ObjectMeta{
Name: "collector-cmon",
},
Spec: monitoringv1.ClusterPodMonitoringSpec{
Selector: metav1.LabelSelector{
MatchLabels: map[string]string{
operator.LabelAppName: operator.NameCollector,
testLabel: t.GetOperatorTestLabelValue(),
},
},
Endpoints: selfScrapeEndpointConfig(),
},
})
}))

const appName = "tls-insecure"
deployment, err := SyntheticAppDeploy(ctx, tctx.Client(), tctx.namespace, appName, []string{})
if err != nil {
tctx.Fatal(err)
}

if err := kubeutil.WaitForDeploymentReady(ctx, tctx.Client(), tctx.namespace, appName); err != nil {
tctx.Fatalf("failed to start app: %s", err)
}
t.Run("synthetic-podmonitoring", tctx.subtest(func(ctx context.Context, t *OperatorContext) {
t.Parallel()
testCollector(ctx, t, &monitoringv1.PodMonitoring{
ObjectMeta: metav1.ObjectMeta{
Name: "synthetic-podmon",
Namespace: t.namespace,
},
Spec: monitoringv1.PodMonitoringSpec{
Selector: metav1.LabelSelector{
MatchLabels: deployment.Spec.Template.Labels,
},
Endpoints: []monitoringv1.ScrapeEndpoint{
{
Port: intstr.FromString(SyntheticAppPortName),
},
},
},
})
}))
t.Run("synthetic-clusterpodmonitoring", tctx.subtest(func(ctx context.Context, t *OperatorContext) {
t.Parallel()
testCollector(ctx, t, &monitoringv1.ClusterPodMonitoring{
ObjectMeta: metav1.ObjectMeta{
Name: "synthetic-cmon",
},
Spec: monitoringv1.ClusterPodMonitoringSpec{
Selector: metav1.LabelSelector{
MatchLabels: deployment.Spec.Template.Labels,
},
Endpoints: []monitoringv1.ScrapeEndpoint{
{
Port: intstr.FromString(SyntheticAppPortName),
},
},
},
})
}))
}))

t.Run("scrape-kubelet", tctx.subtest(testCollectorScrapeKubelet))
}

@@ -162,230 +254,34 @@ func selfScrapeEndpointConfig() []monitoringv1.ScrapeEndpoint {
}
}

func checkStatusConditions(status *monitoringv1.PodMonitoringStatus, expected int) error {
if size := len(status.Conditions); size == 0 {
return errors.New("empty conditions")
} else if size != expected {
return fmt.Errorf("expected %d conditions, but got: %d", expected, size)
}

for _, condition := range status.Conditions {
if condition.Type != monitoringv1.ConfigurationCreateSuccess {
return fmt.Errorf("condition is not successful: %s", condition.Type)
}
}
return nil
}

func checkStatusEndpoints(status *monitoringv1.PodMonitoringStatus, expected int) error {
endpointStatuses := status.EndpointStatuses
if size := len(endpointStatuses); size == 0 {
return errors.New("empty endpoint status")
} else if size != expected {
return fmt.Errorf("expected %d endpoint, but got: %d", expected, size)
}

for _, status := range endpointStatuses {
var err error
if status.UnhealthyTargets != 0 {
err = fmt.Errorf("unhealthy targets: %d", status.UnhealthyTargets)
} else if status.CollectorsFraction != "1" {
err = fmt.Errorf("collectors failed: %s", status.CollectorsFraction)
} else if len(status.SampleGroups) == 0 {
err = errors.New("missing sample groups")
} else if len(status.SampleGroups[0].SampleTargets) == 0 {
err = fmt.Errorf("missing sample targets: %d", status.SampleGroups[0].Count)
}
if err != nil {
return fmt.Errorf("unhealthy endpoint status %q: %w", status.Name, err)
}
}
return nil
}

// testCollectorSelfPodMonitoring sets up pod monitoring of the collector itself
// and waits for samples to become available in Cloud Monitoring.
func testCollectorSelfPodMonitoring(ctx context.Context, t *OperatorContext) {
// The operator should configure the collector to scrape itself and its metrics
// should show up in Cloud Monitoring shortly after.
name := "collector-podmon"
pm := &monitoringv1.PodMonitoring{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: t.namespace,
},
Spec: monitoringv1.PodMonitoringSpec{
Selector: metav1.LabelSelector{
MatchLabels: map[string]string{
operator.LabelAppName: operator.NameCollector,
},
},
Endpoints: selfScrapeEndpointConfig(),
},
}

// testCollector sets up pod monitoring and waits for samples to become available in GCM.
func testCollector(ctx context.Context, t *OperatorContext, pm monitoringv1.PodMonitoringCRD) {
if err := t.Client().Create(ctx, pm); err != nil {
t.Fatalf("create collector PodMonitoring: %s", err)
}
t.Logf("Waiting for PodMonitoring %q to be processed", name)

resVer := ""
var err error
pollErr := wait.Poll(time.Second, 1*time.Minute, func() (bool, error) {
if err = t.Client().Get(ctx, client.ObjectKeyFromObject(pm), pm); err != nil {
return false, fmt.Errorf("getting PodMonitoring failed: %w", err)
}
// Ensure no status update cycles.
// This is not a perfect check as it's possible the get call returns before the operator
// would sync again, however it can serve as a valuable guardrail in case sporadic test
// failures start happening due to update cycles.
if resVer != pm.ResourceVersion {
resVer = pm.ResourceVersion
err = errors.New("waiting for resource version to stabilize")
return false, nil
}

if err = checkStatusConditions(&pm.Status, 1); err != nil {
return false, nil
}
return true, nil
})
if pollErr != nil {
if errors.Is(pollErr, wait.ErrWaitTimeout) && err != nil {
t.Errorf("unable to validate status: %s", err)
} else {
t.Error("unable to validate status due to timeout")
}
}

if !skipGCM {
t.Log("Waiting for up metrics for collector targets")
validateCollectorUpMetrics(ctx, t, name)
t.Fatalf("create collector: %s", err)
}
}
t.Logf("Waiting for %q to be processed", pm.GetName())

// testCollectorSelfClusterPodMonitoring sets up pod monitoring of the collector itself
// and waits for samples to become available in Cloud Monitoring.
func testCollectorSelfClusterPodMonitoring(ctx context.Context, t *OperatorContext) {
// The operator should configure the collector to scrape itself and its metrics
// should show up in Cloud Monitoring shortly after.
name := "collector-cmon"
pm := &monitoringv1.ClusterPodMonitoring{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
Spec: monitoringv1.ClusterPodMonitoringSpec{
Selector: metav1.LabelSelector{
MatchLabels: map[string]string{
operator.LabelAppName: operator.NameCollector,
},
},
Endpoints: selfScrapeEndpointConfig(),
},
if err := WaitForPodMonitoringReady(ctx, t.Client(), pm, true); err != nil {
t.Errorf("unable to validate status: %s", err)
}

if err := t.Client().Create(ctx, pm); err != nil {
t.Fatalf("create ClusterPodMonitoring: %s", err)
}
t.Logf("Waiting for ClusterPodMonitoring %q to be processed", name)

resVer := ""
var err error
pollErr := wait.Poll(time.Second, 1*time.Minute, func() (bool, error) {
if pollErr := wait.Poll(3*time.Second, 2*time.Minute, func() (bool, error) {
if err = t.Client().Get(ctx, client.ObjectKeyFromObject(pm), pm); err != nil {
return false, fmt.Errorf("getting ClusterPodMonitoring failed: %w", err)
}
// Ensure no status update cycles.
// This is not a perfect check as it's possible the get call returns before the operator
// would sync again, however it can serve as a valuable guardrail in case sporadic test
// failures start happening due to update cycles.
if resVer != pm.ResourceVersion {
resVer = pm.ResourceVersion
err = errors.New("waiting for resource version to stabilize")
return false, nil
}

if err = checkStatusConditions(&pm.Status, 1); err != nil {
return false, nil
}
return true, nil
})
if pollErr != nil {
err = IsPodMonitoringSuccess(pm, true)
return err == nil, nil
}); pollErr != nil {
if errors.Is(pollErr, wait.ErrWaitTimeout) && err != nil {
t.Errorf("unable to validate status: %s", err)
} else {
t.Error("unable to validate status due to timeout")
pollErr = err
}
t.Errorf("status does not indicate success: %s", pollErr)
}

if !skipGCM {
t.Log("Waiting for up metrics for collector targets")
validateCollectorUpMetrics(ctx, t, name)
}
}

// testCollectorTargetStatus sets up pod monitoring of the collector itself and
// checks target status.
func testCollectorTargetStatus(ctx context.Context, t *OperatorContext) {
t.createOperatorConfigFrom(ctx, monitoringv1.OperatorConfig{
Features: monitoringv1.OperatorFeatures{
TargetStatus: monitoringv1.TargetStatusSpec{
Enabled: true,
},
},
})

name := "collector-podmon-target-status"
pm := &monitoringv1.PodMonitoring{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: t.namespace,
},
Spec: monitoringv1.PodMonitoringSpec{
Selector: metav1.LabelSelector{
MatchLabels: map[string]string{
operator.LabelAppName: operator.NameCollector,
},
},
Endpoints: selfScrapeEndpointConfig(),
},
}

if err := t.Client().Create(ctx, pm); err != nil {
t.Fatalf("create collector PodMonitoring: %s", err)
}
t.Logf("Waiting for PodMonitoring %q to be processed", name)

resVer := ""
var err error
pollErr := wait.Poll(time.Second, 4*time.Minute, func() (bool, error) {
if err = t.Client().Get(ctx, client.ObjectKeyFromObject(pm), pm); err != nil {
return false, fmt.Errorf("getting PodMonitoring failed: %w", err)
}

// Ensure no status update cycles.
// This is not a perfect check as it's possible the get call returns before the operator
// would sync again, however it can serve as a valuable guardrail in case sporadic test
// failures start happening due to update cycles.
if resVer != pm.ResourceVersion {
resVer = pm.ResourceVersion
err = errors.New("waiting for resource version to stabilize")
return false, nil
}

if err = checkStatusConditions(&pm.Status, 1); err != nil {
return false, nil
}
if err = checkStatusEndpoints(&pm.Status, len(selfScrapeEndpointConfig())); err != nil {
return false, nil
}
return true, nil
})
if pollErr != nil {
if errors.Is(pollErr, wait.ErrWaitTimeout) && err != nil {
pollErr = err
}
t.Errorf("unable to validate status: %s", pollErr)
validateCollectorUpMetrics(ctx, t, pm.GetName())
}
}
