Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Failover Connector PR2 - core failover functionality #29557

Merged
merged 6 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .chloggen/failover-PR2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: new_component

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: failoverconnector

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: PR provides core logic for failover connector and implements failover for trace signals

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [20766]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: []
145 changes: 145 additions & 0 deletions connector/failoverconnector/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,164 @@
package failoverconnector // import "github.com/open-telemetry/opentelemetry-collector-contrib/connector/failoverconnector"

import (
"context"
"errors"
"time"

"go.opentelemetry.io/collector/component"

"github.com/open-telemetry/opentelemetry-collector-contrib/connector/failoverconnector/internal/state"
)

// consumerProvider builds a fan-out consumer for the given set of pipeline IDs.
type consumerProvider[C any] func(...component.ID) (C, error)

// failoverRouter routes signals to the consumer for the current priority level
// and coordinates failover/retry across the configured priority levels.
type failoverRouter[C any] struct {
	consumerProvider consumerProvider[C]     // builds a consumer from a priority level's pipeline IDs
	cfg              *Config                 // priority list, retry interval/gap, max retries
	pS               *state.PipelineSelector // source of truth for current and stable priority index
	consumers        []C                     // consumers indexed by priority level; filled by registerConsumers
	rS               *state.RetryState       // holds the cancel func for the active retry goroutine
}

// Sentinel errors returned by the failover router. Error strings are lowercase
// and unpunctuated per Go convention (staticcheck ST1005), since they are
// routinely wrapped into longer messages.
var (
	errNoValidPipeline = errors.New("all provided pipelines return errors")
	errConsumer        = errors.New("error registering consumer")
)

// newFailoverRouter builds a failoverRouter that draws consumers from provider
// according to the priority levels configured in cfg.
func newFailoverRouter[C any](provider consumerProvider[C], cfg *Config) *failoverRouter[C] {
	router := &failoverRouter[C]{
		consumerProvider: provider,
		cfg:              cfg,
		rS:               &state.RetryState{},
	}
	router.pS = state.NewPipelineSelector(len(cfg.PipelinePriority), cfg.MaxRetries)
	return router
}

// getCurrentConsumer returns the consumer for the current priority level along
// with its index. The boolean result is false when the current index has moved
// past the end of the priority list, meaning no valid pipeline remains.
func (f *failoverRouter[C]) getCurrentConsumer() (C, int, bool) {
	idx := f.pS.GetCurrentIndex()
	if idx < len(f.cfg.PipelinePriority) {
		return f.consumers[idx], idx, true
	}
	// currentIndex was incremented past the bounds of the pipeline list
	var zero C
	return zero, -1, false
}

// registerConsumers builds one consumer per configured priority level (in
// priority order) and stores them on the router. Returns errConsumer if the
// provider fails for any level; on error f.consumers is left unmodified.
func (f *failoverRouter[C]) registerConsumers() error {
	// pre-size: one consumer per priority level is known up front
	consumers := make([]C, 0, len(f.cfg.PipelinePriority))
	for _, pipelines := range f.cfg.PipelinePriority {
		newConsumer, err := f.consumerProvider(pipelines...)
		if err != nil {
			return errConsumer
		}
		consumers = append(consumers, newConsumer)
	}
	f.consumers = consumers
	return nil
}

// handlePipelineError reacts to a consume error reported for priority level idx.
// It advances the pipeline selector and, if the error came from the stable
// pipeline, cancels any in-flight retry cycle and starts a new one.
func (f *failoverRouter[C]) handlePipelineError(idx int) {
	// avoids race condition in case of consumeSIGNAL invocations
	// where index was updated during execution
	if idx != f.pS.GetCurrentIndex() {
		return
	}
	// capture stability BEFORE UpdatePipelineIndex mutates the selector
	doRetry := f.pS.IndexIsStable(idx)
	// UpdatePipelineIndex either increments the pipeline to the next priority
	// or returns it to the stable
	f.pS.UpdatePipelineIndex(idx)
	// if the currentIndex is not the stableIndex, that means the currentIndex is a higher
	// priority index that was set during a retry, in which case we don't want to start a
	// new retry goroutine
	if !doRetry {
		return
	}
	// kill existing retry goroutine if error is from a stable pipeline that failed for the first time
	ctx, cancel := context.WithCancel(context.Background())
	f.rS.InvokeCancel()
	f.rS.UpdateCancelFunc(cancel)
	f.enableRetry(ctx)
}

// enableRetry launches the long-lived retry goroutine. On every RetryInterval
// tick it starts one pass through the pipelines with a higher priority than the
// stable index (cancelling the previous pass if still running). The goroutine
// exits when ctx is cancelled or when every higher-priority level has exhausted
// its retries.
func (f *failoverRouter[C]) enableRetry(ctx context.Context) {
	go func() {
		ticker := time.NewTicker(f.cfg.RetryInterval)
		defer ticker.Stop()

		// stableIndex is snapshotted once; retryHighPriorityPipelines re-checks
		// the live value and bails out if it improves mid-pass
		stableIndex := f.pS.GetStableIndex()
		var cancelFunc context.CancelFunc
		// checkContinueRetry checks that any higher priority levels have retries remaining
		// (have not exceeded their maxRetries)
		for f.checkContinueRetry(stableIndex) {
			select {
			case <-ticker.C:
				// When the nextRetry interval starts we kill the existing iteration through
				// the higher priority pipelines if still in progress
				if cancelFunc != nil {
					cancelFunc()
				}
				cancelFunc = f.handleRetry(ctx, stableIndex)
			case <-ctx.Done():
				return
			}
		}
		// all higher-priority levels exhausted: tear down this retry cycle
		f.rS.InvokeCancel()
	}()
}

// handleRetry launches one asynchronous pass over the pipelines with a higher
// priority than stableIndex and hands back a cancel func so the caller can
// abort the pass if a new retry interval starts before it finishes.
func (f *failoverRouter[C]) handleRetry(parentCtx context.Context, stableIndex int) context.CancelFunc {
	ctx, cancel := context.WithCancel(parentCtx)
	go f.retryHighPriorityPipelines(ctx, stableIndex)
	return cancel
}

// retryHighPriorityPipelines responsible for single iteration through all higher priority pipelines
func (f *failoverRouter[C]) retryHighPriorityPipelines(ctx context.Context, stableIndex int) {
ticker := time.NewTicker(f.cfg.RetryGap)

defer ticker.Stop()

for i := 0; i < stableIndex; i++ {
// if stableIndex was updated to a higher priority level during the execution of the goroutine
// will return to avoid overwriting higher priority level with lower one
if stableIndex > f.pS.GetStableIndex() {
djaglowski marked this conversation as resolved.
Show resolved Hide resolved
return
}
// checks that max retries were not used for this index
if f.pS.MaxRetriesUsed(i) {
continue
}
select {
// return when context is cancelled by parent goroutine
case <-ctx.Done():
return
case <-ticker.C:
// when ticker triggers currentIndex is updated
f.pS.SetToRetryIndex(i)
}
}
}

// checkContinueRetry reports whether the retry loop should keep running: true
// while at least one priority level above index still has retries remaining
// (has not exceeded MaxRetries).
// NOTE(review): reads pS.PipelineRetries without holding pS's lock, unlike the
// other accessors — possible data race; confirm whether this is intentional.
func (f *failoverRouter[C]) checkContinueRetry(index int) bool {
	for i := 0; i < index; i++ {
		if f.pS.PipelineRetries[i] < f.cfg.MaxRetries {
			return true
		}
	}
	return false
}

// reportStable reports back to the failoverRouter that the priority level that
// was called by Consume.SIGNAL succeeded, promoting it to the new stable index
// if it is not already stable.
func (f *failoverRouter[C]) reportStable(idx int) {
	// if idx is already the known stable index there is nothing to update
	if f.pS.IndexIsStable(idx) {
		return
	}
	// idx is a retried index that succeeded, so update the stable index to it
	// NOTE retry will not stop due to potential higher priority index still available
	f.pS.SetNewStableIndex(idx)
}
101 changes: 101 additions & 0 deletions connector/failoverconnector/internal/state/pipeline_selector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package state // import "github.com/open-telemetry/opentelemetry-collector-contrib/connector/failoverconnector/internal/state"

import (
"sync"
)

// PipelineSelector is meant to serve as the source of truth for the target priority level.
// (Fix: removed stray review-UI text that had been pasted between fields.)
type PipelineSelector struct {
	CurrentIndex    int // priority level consume calls currently target
	StableIndex     int // last known healthy priority level
	lock            sync.RWMutex
	PipelineRetries []int // retry count per priority level
	maxRetry        int   // retries allowed per level before it is skipped
}

// UpdatePipelineIndex is the main function that updates the pipeline indexes due to an error
// if the currentIndex is not the stableIndex, that means the currentIndex is a higher
// priority index that was set during a retry, in which case we return to the stable index
func (p *PipelineSelector) UpdatePipelineIndex(idx int) {
	// stable pipeline failed: advance to the next viable priority level
	if p.IndexIsStable(idx) {
		p.SetToNextPriorityPipeline()
	}
	// NOTE(review): SetToStableIndex also runs after SetToNextPriorityPipeline,
	// incrementing PipelineRetries for the newly selected stable level — confirm
	// that is intended rather than an if/else.
	p.SetToStableIndex()
}

// SetToNextPriorityPipeline skips through any lower priority pipelines that have
// exceeded their maxRetries and sets the first that has not as the new stable.
func (p *PipelineSelector) SetToNextPriorityPipeline() {
	p.lock.Lock()
	defer p.lock.Unlock()
	// do-while: always advance at least one level, then keep advancing while
	// the level's retries are exhausted
	for ok := true; ok; ok = p.exceededMaxRetries() {
		p.CurrentIndex++
	}
	p.StableIndex = p.CurrentIndex
}

// exceededMaxRetries reports whether the current index is still within the
// priority list and has used up all of its allowed retries. The caller is
// expected to hold the write lock.
func (p *PipelineSelector) exceededMaxRetries() bool {
	idx := p.CurrentIndex
	if idx >= len(p.PipelineRetries) {
		return false
	}
	return p.PipelineRetries[idx] >= p.maxRetry
}

// SetToStableIndex returns the CurrentIndex to the known Stable Index
func (p *PipelineSelector) SetToStableIndex() {
	p.lock.Lock()
	defer p.lock.Unlock()
	// count this failure against the level that was being tried before reverting
	p.PipelineRetries[p.CurrentIndex]++
	p.CurrentIndex = p.StableIndex
}

// SetToRetryIndex accepts a param and sets the CurrentIndex to this index value
func (p *PipelineSelector) SetToRetryIndex(index int) {
p.lock.Lock()
defer p.lock.Unlock()
p.CurrentIndex = index
}
djaglowski marked this conversation as resolved.
Show resolved Hide resolved

// MaxRetriesUsed reports whether the pipeline at index has exhausted its
// allowed retries.
func (p *PipelineSelector) MaxRetriesUsed(index int) bool {
	p.lock.RLock()
	defer p.lock.RUnlock()
	used := p.PipelineRetries[index]
	return used >= p.maxRetry
}

// SetNewStableIndex Update stableIndex to the passed stable index
func (p *PipelineSelector) SetNewStableIndex(idx int) {
djaglowski marked this conversation as resolved.
Show resolved Hide resolved
p.lock.Lock()
defer p.lock.RUnlock()
p.PipelineRetries[p.CurrentIndex] = 0
p.StableIndex = idx
}

// IndexIsStable reports whether idx is currently the stable index.
func (p *PipelineSelector) IndexIsStable(idx int) bool {
	p.lock.RLock()
	defer p.lock.RUnlock()
	return p.StableIndex == idx
}

// GetStableIndex returns the stable index under a read lock.
func (p *PipelineSelector) GetStableIndex() int {
	p.lock.RLock()
	stable := p.StableIndex
	p.lock.RUnlock()
	return stable
}

func (p *PipelineSelector) GetCurrentIndex() int {
p.lock.RLock()
defer p.lock.RUnlock()
return p.CurrentIndex
}
djaglowski marked this conversation as resolved.
Show resolved Hide resolved

// NewPipelineSelector constructs a PipelineSelector for lenPriority priority
// levels, each allowed maxRetries retries. The current and stable indexes start
// at the highest priority level (0), which is the zero value.
func NewPipelineSelector(lenPriority int, maxRetries int) *PipelineSelector {
	selector := &PipelineSelector{
		PipelineRetries: make([]int, lenPriority),
		maxRetry:        maxRetries,
	}
	return selector
}
56 changes: 56 additions & 0 deletions connector/failoverconnector/internal/state/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package state // import "github.com/open-telemetry/opentelemetry-collector-contrib/connector/failoverconnector/internal/state"

import (
"context"
"sync"
)

// TryLock is a non-blocking mutex built on a buffered channel of size 1:
// acquiring is a non-blocking send, releasing is a receive.
// (Fix: removed stray review-UI text that had been pasted after Unlock.)
type TryLock struct {
	lock chan struct{}
}

// NewTryLock returns a TryLock ready for use.
func NewTryLock() *TryLock {
	return &TryLock{
		lock: make(chan struct{}, 1),
	}
}

// Lock tries to write to a channel of size 1 to maintain a single access point to a resource.
// If the lock is free, fn(arg) runs while holding it; if the lock is already
// held, the default case fires and Lock returns without running fn.
// NOTE: may need to update logic in future so that concurrent calls to consume<SIGNAL> block while the lock is acquired
// and then return automatically once lock is released to avoid repeated calls to consume<SIGNAL> before indexes are updated
func (tl *TryLock) Lock(fn func(int), arg int) {
	select {
	case tl.lock <- struct{}{}:
		defer tl.Unlock()
		fn(arg)
	default:
		// lock held elsewhere: drop this call
	}
}

// Unlock releases the lock by draining the channel.
func (tl *TryLock) Unlock() {
	<-tl.lock
}

// RetryState manages the cancel function for the retry goroutine; keeping the
// cancel func here ends up cleaner than signaling through channels.
type RetryState struct {
	lock        sync.Mutex
	cancelRetry context.CancelFunc
}

// UpdateCancelFunc stores newCancelFunc as the cancel function for the
// currently running retry goroutine.
func (rs *RetryState) UpdateCancelFunc(newCancelFunc context.CancelFunc) {
	rs.lock.Lock()
	defer rs.lock.Unlock()
	rs.cancelRetry = newCancelFunc
}

// InvokeCancel calls the stored cancel function, if one has been registered,
// stopping the retry goroutine it belongs to. Safe to call at any time.
func (rs *RetryState) InvokeCancel() {
	rs.lock.Lock()
	defer rs.lock.Unlock()
	if cancel := rs.cancelRetry; cancel != nil {
		cancel()
	}
}
Loading
Loading