Skip to content

Commit

Permalink
UPSTREAM: 98781: Allow test invokers to skip test waits before and after
Browse files Browse the repository at this point in the history
A number of e2e tests are useful to run after the system has been
disrupted or is in the process of being disrupted, but the current
suite and test logic blocks progress waiting for all nodes to be
healthy.

By passing -1 to --minStartupPods or --allowed-not-ready-nodes flags
the caller can bypass wait logic before and after test suites that
would prevent running e2e during disruption. This allows use of parts
of the e2e suite during cluster duress to verify that controllers or
components still function.
  • Loading branch information
smarterclayton authored and soltysh committed Sep 8, 2021
1 parent a05e773 commit 249e45d
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 4 deletions.
6 changes: 6 additions & 0 deletions test/e2e/e2e.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,13 @@ func getDefaultClusterIPFamily(c clientset.Interface) string {
// waitForDaemonSets waits for all daemonsets in the given namespace to be ready
// (defined as all but 'allowedNotReadyNodes' pods associated with that
// daemonset are ready).
//
// If allowedNotReadyNodes is -1, this method returns immediately without waiting.
func waitForDaemonSets(c clientset.Interface, ns string, allowedNotReadyNodes int32, timeout time.Duration) error {
if allowedNotReadyNodes == -1 {
return nil
}

start := time.Now()
framework.Logf("Waiting up to %v for all daemonsets in namespace '%s' to start",
timeout, ns)
Expand Down
3 changes: 3 additions & 0 deletions test/e2e/framework/node/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ func checkWaitListSchedulableNodes(c clientset.Interface) (*v1.NodeList, error)
func CheckReadyForTests(c clientset.Interface, nonblockingTaints string, allowedNotReadyNodes, largeClusterThreshold int) func() (bool, error) {
attempt := 0
return func() (bool, error) {
if allowedNotReadyNodes == -1 {
return true, nil
}
attempt++
var nodesNotReadyYet []v1.Node
opts := metav1.ListOptions{
Expand Down
7 changes: 7 additions & 0 deletions test/e2e/framework/pod/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,14 @@ func errorBadPodsStates(badPods []v1.Pod, desiredPods int, ns, desiredState stri
// waiting. All pods that are in SUCCESS state are not counted.
//
// If ignoreLabels is not empty, pods matching this selector are ignored.
//
// If either minPods or allowedNotReadyPods is -1, this method returns
// immediately without waiting.
func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods, allowedNotReadyPods int32, timeout time.Duration, ignoreLabels map[string]string) error {
if minPods == -1 || allowedNotReadyPods == -1 {
return nil
}

ignoreSelector := labels.SelectorFromSet(map[string]string{})
start := time.Now()
e2elog.Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
Expand Down
4 changes: 2 additions & 2 deletions test/e2e/framework/test_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ func RegisterCommonFlags(flags *flag.FlagSet) {
flags.StringVar(&TestContext.LogexporterGCSPath, "logexporter-gcs-path", "", "Path to the GCS artifacts directory to dump logs from nodes. Logexporter gets enabled if this is non-empty.")
flags.BoolVar(&TestContext.DeleteNamespace, "delete-namespace", true, "If true tests will delete namespace after completion. It is only designed to make debugging easier, DO NOT turn it off by default.")
flags.BoolVar(&TestContext.DeleteNamespaceOnFailure, "delete-namespace-on-failure", true, "If true, framework will delete test namespace on failure. Used only during test debugging.")
flags.IntVar(&TestContext.AllowedNotReadyNodes, "allowed-not-ready-nodes", 0, "If non-zero, framework will allow for that many non-ready nodes when checking for all ready nodes.")
flags.IntVar(&TestContext.AllowedNotReadyNodes, "allowed-not-ready-nodes", 0, "If greater than zero, framework will allow for that many non-ready nodes when checking for all ready nodes. If -1, no waiting will be performed for ready nodes or daemonset pods.")

flags.StringVar(&TestContext.Host, "host", "", fmt.Sprintf("The host, or apiserver, to connect to. Will default to %s if this argument and --kubeconfig are not set.", defaultHost))
flags.StringVar(&TestContext.ReportPrefix, "report-prefix", "", "Optional prefix for JUnit XML reports. Default is empty, which doesn't prepend anything to the default name.")
Expand Down Expand Up @@ -354,7 +354,7 @@ func RegisterClusterFlags(flags *flag.FlagSet) {

flags.StringVar(&cloudConfig.ClusterTag, "cluster-tag", "", "Tag used to identify resources. Only required if provider is aws.")
flags.StringVar(&cloudConfig.ConfigFile, "cloud-config-file", "", "Cloud config file. Only required if provider is azure or vsphere.")
flags.IntVar(&TestContext.MinStartupPods, "minStartupPods", 0, "The number of pods which we need to see in 'Running' state with a 'Ready' condition of true, before we try running tests. This is useful in any cluster which needs some base pod-based services running before it can be used.")
flags.IntVar(&TestContext.MinStartupPods, "minStartupPods", 0, "The number of pods which we need to see in 'Running' state with a 'Ready' condition of true, before we try running tests. This is useful in any cluster which needs some base pod-based services running before it can be used. If set to -1, no pods are checked and tests run straight away.")
flags.DurationVar(&TestContext.SystemPodsStartupTimeout, "system-pods-startup-timeout", 10*time.Minute, "Timeout for waiting for all system pods to be running before starting tests.")
flags.DurationVar(&TestContext.NodeSchedulableTimeout, "node-schedulable-timeout", 30*time.Minute, "Timeout for waiting for all nodes to be schedulable.")
flags.DurationVar(&TestContext.SystemDaemonsetStartupTimeout, "system-daemonsets-startup-timeout", 5*time.Minute, "Timeout for waiting for all system daemonsets to be ready.")
Expand Down
12 changes: 10 additions & 2 deletions test/e2e/framework/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -1020,8 +1020,11 @@ func getNodeEvents(c clientset.Interface, nodeName string) []v1.Event {
// WaitForAllNodesSchedulable waits up to timeout for all nodes
// (but TestContext.AllowedNotReadyNodes) to become schedulable.
func WaitForAllNodesSchedulable(c clientset.Interface, timeout time.Duration) error {
Logf("Waiting up to %v for all (but %d) nodes to be schedulable", timeout, TestContext.AllowedNotReadyNodes)
if TestContext.AllowedNotReadyNodes == -1 {
return nil
}

Logf("Waiting up to %v for all (but %d) nodes to be schedulable", timeout, TestContext.AllowedNotReadyNodes)
return wait.PollImmediate(
30*time.Second,
timeout,
Expand Down Expand Up @@ -1114,11 +1117,16 @@ func RunHostCmdWithRetries(ns, name, cmd string, interval, timeout time.Duration
}
}

// AllNodesReady checks whether all registered nodes are ready.
// AllNodesReady checks whether all registered nodes are ready. Setting -1 on
// TestContext.AllowedNotReadyNodes will bypass the post test node readiness check.
// TODO: we should change the AllNodesReady call in AfterEach to WaitForAllNodesHealthy,
// and figure out how to do it in a configurable way, as we can't expect all setups to run
// default test add-ons.
func AllNodesReady(c clientset.Interface, timeout time.Duration) error {
if TestContext.AllowedNotReadyNodes == -1 {
return nil
}

Logf("Waiting up to %v for all (but %d) nodes to be ready", timeout, TestContext.AllowedNotReadyNodes)

var notReady []*v1.Node
Expand Down

0 comments on commit 249e45d

Please sign in to comment.