tektoncd · tekton-robot · May 29, 2019 · May 23, 2019 · bobcatfish · May 28, 2019
diff --git a/cmd/controller/main.go b/cmd/controller/main.go
@@ -113,7 +113,7 @@ func main() {
 
 	pipelineInformer := pipelineInformerFactory.Tekton().V1alpha1().Pipelines()
 	pipelineRunInformer := pipelineInformerFactory.Tekton().V1alpha1().PipelineRuns()
-	timeoutHandler := reconciler.NewTimeoutHandler(kubeClient, pipelineClient, stopCh, logger)
+	timeoutHandler := reconciler.NewTimeoutHandler(stopCh, logger)
 
 	trc := taskrun.NewController(opt,
 		taskRunInformer,
@@ -141,7 +141,7 @@ func main() {
 	}
 	timeoutHandler.SetTaskRunCallbackFunc(trc.Enqueue)
 	timeoutHandler.SetPipelineRunCallbackFunc(prc.Enqueue)
-	timeoutHandler.CheckTimeouts()
+	timeoutHandler.CheckTimeouts(kubeClient, pipelineClient)
 
 	// Watch the logging config map and dynamically update logging levels.
 	configMapWatcher.Watch(logging.ConfigName, logging.UpdateLevelFromConfigMap(logger, atomicLevel, logging.ControllerLogKey))

diff --git a/pkg/reconciler/timeout_handler.go b/pkg/reconciler/timeout_handler.go
@@ -1,6 +1,8 @@
 package reconciler
 
 import (
+	"math"
+	"math/rand"
 	"sync"
 
 	"time"
@@ -12,45 +14,56 @@ import (
 	"k8s.io/client-go/kubernetes"
 )
 
-var (
-	defaultFunc = func(i interface{}) {}
+const (
+	defaultTimeout    = 10 * time.Minute
+	maxBackoffSeconds = 120
 )
 
-const (
-	defaultTimeout = 10 * time.Minute
+var (
+	defaultFunc        = func(i interface{}) {}
+	maxBackoffExponent = math.Ceil(math.Log2(maxBackoffSeconds))
 )
 
 // StatusKey interface to be implemented by Taskrun Pipelinerun types
 type StatusKey interface {
 	GetRunKey() string
 }
 
+// backoff contains state of exponential backoff for a given StatusKey
+type backoff struct {
+	// NumAttempts reflects the number of times a given StatusKey has been delayed
+	NumAttempts uint
+	// NextAttempt is the point in time at which this backoff expires
+	NextAttempt time.Time
+}
+
+// jitterFunc is a func applied to a computed backoff duration to remove uniformity
+// from its results. A jitterFunc receives the number of seconds calculated by a
+// backoff algorithm and returns the "jittered" result.
+type jitterFunc func(numSeconds int) (jitteredSeconds int)
+
 // TimeoutSet contains required k8s interfaces to handle build timeouts
 type TimeoutSet struct {
 	logger                  *zap.SugaredLogger
-	kubeclientset           kubernetes.Interface
-	pipelineclientset       clientset.Interface
 	taskRunCallbackFunc     func(interface{})
 	pipelineRunCallbackFunc func(interface{})
 	stopCh                  <-chan struct{}
 	done                    map[string]chan bool
 	doneMut                 sync.Mutex
+	backoffs                map[string]backoff
+	backoffsMut             sync.Mutex
 }
 
 // NewTimeoutHandler returns TimeoutSet filled structure
 func NewTimeoutHandler(
-	kubeclientset kubernetes.Interface,
-	pipelineclientset clientset.Interface,
 	stopCh <-chan struct{},
 	logger *zap.SugaredLogger,
 ) *TimeoutSet {
 	return &TimeoutSet{
-		kubeclientset:     kubeclientset,
-		pipelineclientset: pipelineclientset,
-		stopCh:            stopCh,
-		done:              make(map[string]chan bool),
-		doneMut:           sync.Mutex{},
-		logger:            logger,
+		stopCh:   stopCh,
+		done:     make(map[string]chan bool),
+		backoffs: make(map[string]backoff),
+		logger:   logger,
 	}
 }
 
@@ -64,16 +77,20 @@ func (t *TimeoutSet) SetPipelineRunCallbackFunc(f func(interface{})) {
 	t.pipelineRunCallbackFunc = f
 }
 
-// Release function deletes key from timeout map
+// Release deletes channels and data that are specific to a StatusKey object.
 func (t *TimeoutSet) Release(runObj StatusKey) {
 	key := runObj.GetRunKey()
 	t.doneMut.Lock()
 	defer t.doneMut.Unlock()
 
+	t.backoffsMut.Lock()
+	defer t.backoffsMut.Unlock()
+
 	if finished, ok := t.done[key]; ok {
 		delete(t.done, key)
 		close(finished)
 	}
+	delete(t.backoffs, key)
 }
 
 func (t *TimeoutSet) getOrCreateFinishedChan(runObj StatusKey) chan bool {
@@ -101,10 +118,52 @@ func GetTimeout(d *metav1.Duration) time.Duration {
 	return timeout
 }
 
+// GetBackoff records the number of times it has seen a TaskRun and calculates an
+// appropriate backoff deadline based on that count. Only one backoff per TaskRun
+// may be active at any moment. Requests for a new backoff in the face of an
+// existing one will be ignored and details of the existing backoff will be returned
+// instead. Further, if a calculated backoff time is after the timeout of the TaskRun
+// then the time of the timeout will be returned instead.
+//
+// Returned values are a backoff struct containing a NumAttempts field with the
+// number of attempts performed for this TaskRun and a NextAttempt field
+// describing the time at which the next attempt should be performed.
+// Additionally a boolean is returned indicating whether a backoff for the
+// TaskRun is already in progress.
+func (t *TimeoutSet) GetBackoff(tr *v1alpha1.TaskRun) (backoff, bool) {
+	t.backoffsMut.Lock()
+	defer t.backoffsMut.Unlock()
+	b := t.backoffs[tr.GetRunKey()]
+	if time.Now().Before(b.NextAttempt) {
+		return b, true
+	}
+	b.NumAttempts += 1
+	b.NextAttempt = time.Now().Add(backoffDuration(b.NumAttempts, rand.Intn))
+	timeoutDeadline := tr.Status.StartTime.Time.Add(GetTimeout(tr.Spec.Timeout))
+	if timeoutDeadline.Before(b.NextAttempt) {
+		b.NextAttempt = timeoutDeadline
+	}
+	t.backoffs[tr.GetRunKey()] = b
+	return b, false
+}
+
+func backoffDuration(count uint, jf jitterFunc) time.Duration {
+	exp := float64(count)
+	if exp > maxBackoffExponent {
+		exp = maxBackoffExponent
+	}
+	seconds := int(math.Exp2(exp))
+	jittered := 1 + jf(seconds)
+	if jittered > maxBackoffSeconds {
+		jittered = maxBackoffSeconds
+	}
+	return time.Duration(jittered) * time.Second
+}
+
 // checkPipelineRunTimeouts function creates goroutines to wait for pipelinerun to
 // finish/timeout in a given namespace
-func (t *TimeoutSet) checkPipelineRunTimeouts(namespace string) {
-	pipelineRuns, err := t.pipelineclientset.TektonV1alpha1().PipelineRuns(namespace).List(metav1.ListOptions{})
+func (t *TimeoutSet) checkPipelineRunTimeouts(namespace string, pipelineclientset clientset.Interface) {
+	pipelineRuns, err := pipelineclientset.TektonV1alpha1().PipelineRuns(namespace).List(metav1.ListOptions{})
 	if err != nil {
 		t.logger.Errorf("Can't get pipelinerun list in namespace %s: %s", namespace, err)
 		return
@@ -122,22 +181,22 @@ func (t *TimeoutSet) checkPipelineRunTimeouts(namespace string) {
 
 // CheckTimeouts function iterates through all namespaces and calls corresponding
 // taskrun/pipelinerun timeout functions
-func (t *TimeoutSet) CheckTimeouts() {
-	namespaces, err := t.kubeclientset.CoreV1().Namespaces().List(metav1.ListOptions{})
+func (t *TimeoutSet) CheckTimeouts(kubeclientset kubernetes.Interface, pipelineclientset clientset.Interface) {
+	namespaces, err := kubeclientset.CoreV1().Namespaces().List(metav1.ListOptions{})
 	if err != nil {
 		t.logger.Errorf("Can't get namespaces list: %s", err)
 		return
 	}
 	for _, namespace := range namespaces.Items {
-		t.checkTaskRunTimeouts(namespace.GetName())
-		t.checkPipelineRunTimeouts(namespace.GetName())
+		t.checkTaskRunTimeouts(namespace.GetName(), pipelineclientset)
+		t.checkPipelineRunTimeouts(namespace.GetName(), pipelineclientset)
 	}
 }
 
 // checkTaskRunTimeouts function creates goroutines to wait for pipelinerun to
 // finish/timeout in a given namespace
-func (t *TimeoutSet) checkTaskRunTimeouts(namespace string) {
-	taskruns, err := t.pipelineclientset.TektonV1alpha1().TaskRuns(namespace).List(metav1.ListOptions{})
+func (t *TimeoutSet) checkTaskRunTimeouts(namespace string, pipelineclientset clientset.Interface) {
+	taskruns, err := pipelineclientset.TektonV1alpha1().TaskRuns(namespace).List(metav1.ListOptions{})
 	if err != nil {
 		t.logger.Errorf("Can't get taskrun list in namespace %s: %s", namespace, err)
 		return
@@ -172,26 +231,43 @@ func (t *TimeoutSet) waitRun(runObj StatusKey, timeout time.Duration, startTime
 		t.logger.Errorf("startTime must be specified in order for a timeout to be calculated accurately for %s", runObj.GetRunKey())
 		return
 	}
+	if callback == nil {
+		callback = defaultFunc
+	}
 	runtime := time.Since(startTime.Time)
-	finished := t.getOrCreateFinishedChan(runObj)
-
+	t.logger.Infof("About to start timeout timer for %s. started at %s, timeout is %s, running for %s", runObj.GetRunKey(), startTime.Time, timeout, runtime)
 	defer t.Release(runObj)
+	t.setTimer(runObj, timeout-runtime, callback)
+}
 
-	t.logger.Infof("About to start timeout timer for %s. started at %s, timeout is %s, running for %s", runObj.GetRunKey(), startTime.Time, timeout, runtime)
+// SetTaskRunTimer creates a blocking function for taskrun to wait for
+// 1. Stop signal, 2. TaskRun to complete or 3. a given Duration to elapse.
+//
+// Since the timer's duration is a parameter rather than being tied to
+// the lifetime of the TaskRun no resources are released after the timer
+// fires. It is the caller's responsibility to Release() the TaskRun when
+// work with it has completed.
+func (t *TimeoutSet) SetTaskRunTimer(tr *v1alpha1.TaskRun, d time.Duration) {
+	callback := t.taskRunCallbackFunc
+	if callback == nil {
+		t.logger.Errorf("attempted to set a timer for %q but no task run callback has been assigned", tr.Name)
+		return
+	}
+	t.setTimer(tr, d, callback)
+}
 
+func (t *TimeoutSet) setTimer(runObj StatusKey, timeout time.Duration, callback func(interface{})) {
+	finished := t.getOrCreateFinishedChan(runObj)
+	started := time.Now()
 	select {
 	case <-t.stopCh:
-		t.logger.Infof("Stopping timeout timer for %s", runObj.GetRunKey())
+		t.logger.Infof("stopping timer for %q", runObj.GetRunKey())
 		return
 	case <-finished:
-		t.logger.Infof("%s finished, stopping the timeout timer", runObj.GetRunKey())
+		t.logger.Infof("%q finished, stopping timer", runObj.GetRunKey())
 		return
-	case <-time.After(timeout - runtime):
-		t.logger.Infof("Timeout timer for %s has timed out (started at %s, timeout is %s, running for %s", runObj.GetRunKey(), startTime, timeout, time.Since(startTime.Time))
-		if callback != nil {
-			callback(runObj)
-		} else {
-			defaultFunc(runObj)
-		}
+	case <-time.After(timeout):
+		t.logger.Infof("timer for %q has activated after %s", runObj.GetRunKey(), time.Since(started).String())
+		callback(runObj)
 	}
 }