From 4cc595f1545133182c85572c1131db5164cd66f1 Mon Sep 17 00:00:00 2001 From: Swetha Repakula Date: Thu, 20 Aug 2020 11:50:32 -0700 Subject: [PATCH 1/2] Add custom named neg as feature for usage metrics --- pkg/metrics/features.go | 1 + pkg/metrics/metrics.go | 2 ++ pkg/metrics/metrics_test.go | 43 +++++++++++++++++++++++++++---------- pkg/metrics/types.go | 2 ++ pkg/neg/controller.go | 5 +++-- 5 files changed, 40 insertions(+), 13 deletions(-) diff --git a/pkg/metrics/features.go b/pkg/metrics/features.go index 8f8b052e1c..991de0b88c 100644 --- a/pkg/metrics/features.go +++ b/pkg/metrics/features.go @@ -89,6 +89,7 @@ const ( vmIpNeg = feature("VmIpNEG") vmIpNegLocal = feature("VmIpNegLocal") vmIpNegCluster = feature("VmIpNegCluster") + customNamedNeg = feature("CustomNamedNEG") l4ILBService = feature("L4ILBService") l4ILBGlobalAccess = feature("L4ILBGlobalAccess") diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index d6410e0ff3..2637c726e7 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -286,6 +286,7 @@ func (im *ControllerMetrics) computeNegMetrics() map[feature]int { vmIpNeg: 0, vmIpNegLocal: 0, vmIpNegCluster: 0, + customNamedNeg: 0, } for key, negState := range im.negMap { @@ -295,6 +296,7 @@ func (im *ControllerMetrics) computeNegMetrics() map[feature]int { counts[ingressNeg] += negState.IngressNeg counts[asmNeg] += negState.AsmNeg counts[neg] += negState.AsmNeg + negState.StandaloneNeg + negState.IngressNeg + counts[customNamedNeg] += negState.CustomNamedNeg if negState.VmIpNeg != nil { counts[neg] += 1 counts[vmIpNeg] += 1 diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 321f2ef555..6416088d46 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -789,12 +789,13 @@ func TestComputeNegMetrics(t *testing.T) { vmIpNeg: 0, vmIpNegLocal: 0, vmIpNegCluster: 0, + customNamedNeg: 0, }, }, { "one neg service", []NegServiceState{ - newNegState(0, 0, 1, nil), + newNegState(0, 0, 1, 0, nil), }, map[feature]int{ standaloneNeg: 0, @@ -804,12 +805,13 @@ func TestComputeNegMetrics(t *testing.T) { vmIpNeg: 0, vmIpNegLocal: 0, vmIpNegCluster: 0, + customNamedNeg: 0, }, }, { "vm primary ip neg in traffic policy cluster mode", []NegServiceState{ - newNegState(0, 0, 1, &VmIpNegType{trafficPolicyLocal: false}), + newNegState(0, 0, 1, 0, &VmIpNegType{trafficPolicyLocal: false}), }, map[feature]int{ standaloneNeg: 0, @@ -819,15 +821,32 @@ func TestComputeNegMetrics(t *testing.T) { vmIpNeg: 1, vmIpNegLocal: 0, vmIpNegCluster: 1, + customNamedNeg: 0, + }, + }, + { + "custom named neg", + []NegServiceState{ + newNegState(1, 0, 0, 1, nil), + }, + map[feature]int{ + standaloneNeg: 1, + ingressNeg: 0, + asmNeg: 0, + neg: 1, + vmIpNeg: 0, + vmIpNegLocal: 0, + vmIpNegCluster: 0, + customNamedNeg: 1, }, }, { "many neg services", []NegServiceState{ - newNegState(0, 0, 1, nil), - newNegState(0, 1, 0, &VmIpNegType{trafficPolicyLocal: false}), - newNegState(5, 0, 0, &VmIpNegType{trafficPolicyLocal: true}), - newNegState(5, 3, 2, nil), + newNegState(0, 0, 1, 0, nil), + newNegState(0, 1, 0, 0, &VmIpNegType{trafficPolicyLocal: false}), + newNegState(5, 0, 0, 0, &VmIpNegType{trafficPolicyLocal: true}), + newNegState(5, 3, 2, 0, nil), }, map[feature]int{ standaloneNeg: 10, @@ -837,6 +856,7 @@ func TestComputeNegMetrics(t *testing.T) { vmIpNeg: 2, vmIpNegLocal: 1, vmIpNegCluster: 1, + customNamedNeg: 0, }, }, } { @@ -856,12 +876,13 @@ func TestComputeNegMetrics(t *testing.T) { } } -func newNegState(standalone, ingress, asm int, negType *VmIpNegType) NegServiceState { +func newNegState(standalone, ingress, asm, customNamed int, negType *VmIpNegType) NegServiceState { return NegServiceState{ - IngressNeg: ingress, - StandaloneNeg: standalone, - AsmNeg: asm, - VmIpNeg: negType, + IngressNeg: ingress, + StandaloneNeg: standalone, + AsmNeg: asm, + VmIpNeg: negType, + CustomNamedNeg: customNamed, } } diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 2be3c46ef6..630377450d 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -37,6 +37,8 @@ type NegServiceState struct { AsmNeg int // VmIpNeg specifies if a service uses GCE_VM_IP NEG. VmIpNeg *VmIpNegType + // CustomNamedNeg is the count of standalone negs with custom names + CustomNamedNeg int } // VmIpNegType contains whether a GCE_VM_IP NEG is requesting for diff --git a/pkg/neg/controller.go b/pkg/neg/controller.go index f7ff35ff49..61fc1953b2 100644 --- a/pkg/neg/controller.go +++ b/pkg/neg/controller.go @@ -403,7 +403,7 @@ func (c *Controller) processService(key string) error { return err } negUsage.IngressNeg = len(svcPortInfoMap) - if err := c.mergeStandaloneNEGsPortInfo(service, types.NamespacedName{Namespace: namespace, Name: name}, svcPortInfoMap); err != nil { + if err := c.mergeStandaloneNEGsPortInfo(service, types.NamespacedName{Namespace: namespace, Name: name}, svcPortInfoMap, &negUsage); err != nil { return err } negUsage.StandaloneNeg = len(svcPortInfoMap) - negUsage.IngressNeg @@ -470,7 +470,7 @@ func (c *Controller) mergeIngressPortInfo(service *apiv1.Service, name types.Nam } // mergeStandaloneNEGsPortInfo merge Standalone NEG PortInfo into portInfoMap -func (c *Controller) mergeStandaloneNEGsPortInfo(service *apiv1.Service, name types.NamespacedName, portInfoMap negtypes.PortInfoMap) error { +func (c *Controller) mergeStandaloneNEGsPortInfo(service *apiv1.Service, name types.NamespacedName, portInfoMap negtypes.PortInfoMap, negUsage *usage.NegServiceState) error { negAnnotation, foundNEGAnnotation, err := annotations.FromService(service).NEGAnnotation() if err != nil { return err @@ -502,6 +502,7 @@ func (c *Controller) mergeStandaloneNEGsPortInfo(service *apiv1.Service, name ty if negAnnotation.NEGEnabledForIngress() && len(customNames) != 0 { return fmt.Errorf("configuration for negs in service (%s) is invalid, custom neg name cannot be used with ingress enabled", name.String()) } + negUsage.CustomNamedNeg = len(customNames) if err := portInfoMap.Merge(negtypes.NewPortInfoMap(name.Namespace, name.Name, exposedNegSvcPort, c.namer /*readinessGate*/, true, customNames)); err != nil { return fmt.Errorf("failed to merge service ports exposed as standalone NEGs (%v) into ingress referenced service ports (%v): %v", exposedNegSvcPort, portInfoMap, err) From aba3c6ef95f95e588fa18528b9ce1f1173feabaf Mon Sep 17 00:00:00 2001 From: Swetha Repakula Date: Thu, 20 Aug 2020 12:05:58 -0700 Subject: [PATCH 2/2] Add internal neg controller metrics - remove `glbc` namespace to existing ingress metrics - change last sync timestamp metric to gauge --- pkg/neg/controller.go | 6 +- pkg/neg/manager.go | 9 ++- pkg/neg/metrics/metrics.go | 129 +++++++++++++++++++++++++++------ pkg/neg/syncers/transaction.go | 16 +++- 4 files changed, 129 insertions(+), 31 deletions(-) diff --git a/pkg/neg/controller.go b/pkg/neg/controller.go index 61fc1953b2..a2162a9cdf 100644 --- a/pkg/neg/controller.go +++ b/pkg/neg/controller.go @@ -333,7 +333,7 @@ func (c *Controller) nodeWorker() { func (c *Controller) processNode() { defer func() { now := c.nodeSyncTracker.Track() - metrics.LastSyncTimestamp.WithLabelValues().Set(float64(now.UTC().UnixNano())) + metrics.LastSyncTimestamp.Set(float64(now.UTC().UnixNano())) }() c.manager.SyncNodes() } @@ -342,7 +342,7 @@ func (c *Controller) processNode() { func (c *Controller) processEndpoint(key string) { defer func() { now := c.syncTracker.Track() - metrics.LastSyncTimestamp.WithLabelValues().Set(float64(now.UTC().UnixNano())) + metrics.LastSyncTimestamp.Set(float64(now.UTC().UnixNano())) }() namespace, name, err := cache.SplitMetaNamespaceKey(key) @@ -371,7 +371,7 @@ func (c *Controller) serviceWorker() { func (c *Controller) processService(key string) error { defer func() { now := c.syncTracker.Track() - metrics.LastSyncTimestamp.WithLabelValues().Set(float64(now.UTC().UnixNano())) + metrics.LastSyncTimestamp.Set(float64(now.UTC().UnixNano())) }() namespace, name, err := cache.SplitMetaNamespaceKey(key) diff --git a/pkg/neg/manager.go b/pkg/neg/manager.go index 7bce9aae2b..d8e48a66dc 100644 --- a/pkg/neg/manager.go +++ b/pkg/neg/manager.go @@ -21,6 +21,7 @@ import ( "fmt" "reflect" "sync" + "time" "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud" "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" @@ -36,6 +37,7 @@ import ( "k8s.io/client-go/tools/record" negv1beta1 "k8s.io/ingress-gce/pkg/apis/svcneg/v1beta1" "k8s.io/ingress-gce/pkg/flags" + "k8s.io/ingress-gce/pkg/neg/metrics" "k8s.io/ingress-gce/pkg/neg/readiness" negsyncer "k8s.io/ingress-gce/pkg/neg/syncers" negtypes "k8s.io/ingress-gce/pkg/neg/types" @@ -108,6 +110,7 @@ func newSyncerManager(namer negtypes.NetworkEndpointGroupNamer, recorder record. func (manager *syncerManager) EnsureSyncers(namespace, name string, newPorts negtypes.PortInfoMap) error { manager.mu.Lock() defer manager.mu.Unlock() + start := time.Now() key := getServiceKey(namespace, name) currentPorts, ok := manager.svcPortMap[key] if !ok { @@ -177,7 +180,9 @@ func (manager *syncerManager) EnsureSyncers(namespace, name string, newPorts neg } } } - return utilerrors.NewAggregate(errList) + err := utilerrors.NewAggregate(errList) + metrics.PublishNegManagerProcessMetrics(metrics.SyncProcess, err, start) + return err } // StopSyncer stops all syncers for the input service. @@ -236,6 +241,7 @@ func (manager *syncerManager) ShutDown() { func (manager *syncerManager) GC() error { klog.V(2).Infof("Start NEG garbage collection.") defer klog.V(2).Infof("NEG garbage collection finished.") + start := time.Now() // Garbage collect Syncers manager.garbageCollectSyncer() @@ -249,6 +255,7 @@ func (manager *syncerManager) GC() error { if err != nil { err = fmt.Errorf("failed to garbage collect negs: %v", err) } + metrics.PublishNegManagerProcessMetrics(metrics.GCProcess, err, start) return err } diff --git a/pkg/neg/metrics/metrics.go b/pkg/neg/metrics/metrics.go index bd765e8cde..f2b772225c 100644 --- a/pkg/neg/metrics/metrics.go +++ b/pkg/neg/metrics/metrics.go @@ -21,48 +21,101 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" - "k8s.io/ingress-gce/pkg/metrics" ) const ( - negControllerSubsystem = "neg_controller" - syncLatencyKey = "neg_sync_duration_seconds" - lastSyncTimestampKey = "sync_timestamp" + negControllerSubsystem = "neg_controller" + syncerLatencyKey = "syncer_sync_duration_seconds" + managerProcessLatencyKey = "manager_process_duration_seconds" + initLatencyKey = "neg_initialization_duration_seconds" + negOpLatencyKey = "endpoint_operation_duration_seconds" + negOpEndpointsKey = "neg_operation_endpoints" + lastSyncTimestampKey = "sync_timestamp" resultSuccess = "success" resultError = "error" - AttachSync = syncType("attach") - DetachSync = syncType("detach") + GCProcess = "GC" + SyncProcess = "Sync" ) type syncType string var ( - syncMetricsLabels = []string{ - "key", // The key to uniquely identify the NEG syncer. - "type", // Type of the NEG sync - "result", // Result of the sync. + negOpLatencyMetricsLabels = []string{ + "operation", // endpoint operation + "neg_type", // type of neg + "api_version", // GCE API version + "result", // result of the sync } - SyncLatency = prometheus.NewHistogramVec( + negOpEndpointsMetricsLabels = []string{ + "operation", // endpoint operation + "neg_type", // type of neg + "result", // result of the sync + } + + negProcessMetricsLabels = []string{ + "process", // type of manager process loop + "result", // result of the process + } + + syncerMetricsLabels = []string{ + "neg_type", //type of neg + "endpoint_calculator_mode", // type of endpoint calculator used + "result", // result of the sync + } + + NegOperationLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: negControllerSubsystem, + Name: negOpLatencyKey, + Help: "Latency of a NEG Operation", + }, + negOpLatencyMetricsLabels, + ) + + NegOperationEndpoints = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: negControllerSubsystem, + Name: negOpEndpointsKey, + Help: "Number of Endpoints during an NEG Operation", + }, + negOpEndpointsMetricsLabels, + ) + + SyncerSyncLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: negControllerSubsystem, + Name: syncerLatencyKey, + Help: "Sync latency for NEG Syncer", + }, + syncerMetricsLabels, + ) + + ManagerProcessLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Namespace: metrics.GLBC_NAMESPACE, Subsystem: negControllerSubsystem, - Name: syncLatencyKey, - Help: "Sync latency of a NEG syncer", + Name: managerProcessLatencyKey, + Help: "Process latency for NEG Manager", }, - syncMetricsLabels, + negProcessMetricsLabels, ) - LastSyncTimestamp = prometheus.NewGaugeVec( + InitializationLatency = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Subsystem: negControllerSubsystem, + Name: initLatencyKey, + Help: "Initialization latency of a NEG", + }, + ) + + LastSyncTimestamp = prometheus.NewGauge( prometheus.GaugeOpts{ - Namespace: metrics.GLBC_NAMESPACE, Subsystem: negControllerSubsystem, Name: lastSyncTimestampKey, Help: "The timestamp of the last execution of NEG controller sync loop.", }, - []string{}, ) ) @@ -70,16 +123,44 @@ var register sync.Once func RegisterMetrics() { register.Do(func() { - prometheus.MustRegister(SyncLatency) + prometheus.MustRegister(NegOperationLatency) + prometheus.MustRegister(NegOperationEndpoints) + prometheus.MustRegister(ManagerProcessLatency) + prometheus.MustRegister(SyncerSyncLatency) prometheus.MustRegister(LastSyncTimestamp) + prometheus.MustRegister(InitializationLatency) }) } -// ObserveNegSync publish collected metrics for the sync of NEG -func ObserveNegSync(negName string, syncType syncType, err error, start time.Time) { - result := resultSuccess +// PublishNegOperationMetrics publishes collected metrics for neg operations +func PublishNegOperationMetrics(operation, negType, apiVersion string, err error, numEndpoints int, start time.Time) { + result := getResult(err) + + NegOperationLatency.WithLabelValues(operation, negType, apiVersion, result).Observe(time.Since(start).Seconds()) + NegOperationEndpoints.WithLabelValues(operation, negType, result).Observe(float64(numEndpoints)) +} + +// PublishNegSyncMetrics publishes collected metrics for the sync of NEG +func PublishNegSyncMetrics(negType, endpointCalculator string, err error, start time.Time) { + result := getResult(err) + + SyncerSyncLatency.WithLabelValues(negType, endpointCalculator, result).Observe(time.Since(start).Seconds()) +} + +// PublishNegManagerProcessMetrics publishes collected metrics for the neg manager loops +func PublishNegManagerProcessMetrics(process string, err error, start time.Time) { + result := getResult(err) + ManagerProcessLatency.WithLabelValues(process, result).Observe(time.Since(start).Seconds()) +} + +// PublishNegInitializationMetrics publishes collected metrics for time from request to initialization of NEG +func PublishNegInitializationMetrics(latency time.Duration) { + InitializationLatency.Observe(latency.Seconds()) +} + +func getResult(err error) string { if err != nil { - result = resultError + return resultError } - SyncLatency.WithLabelValues(negName, string(syncType), result).Observe(time.Since(start).Seconds()) + return resultSuccess } diff --git a/pkg/neg/syncers/transaction.go b/pkg/neg/syncers/transaction.go index d0bc0d04d7..f9bb4d2f77 100644 --- a/pkg/neg/syncers/transaction.go +++ b/pkg/neg/syncers/transaction.go @@ -19,6 +19,7 @@ package syncers import ( "context" "sync" + "time" "fmt" @@ -33,6 +34,7 @@ import ( "k8s.io/client-go/tools/record" negv1beta1 "k8s.io/ingress-gce/pkg/apis/svcneg/v1beta1" "k8s.io/ingress-gce/pkg/composite" + "k8s.io/ingress-gce/pkg/neg/metrics" "k8s.io/ingress-gce/pkg/neg/readiness" negtypes "k8s.io/ingress-gce/pkg/neg/types" svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" @@ -137,10 +139,13 @@ func (s *transactionSyncer) sync() error { func (s *transactionSyncer) syncInternal() error { s.syncLock.Lock() defer s.syncLock.Unlock() + // NOTE: Error will be used to update the status on corresponding Neg CR if Neg CRD is enabled // Please reuse and set err before returning var err error defer s.updateStatus(err) + start := time.Now() + defer metrics.PublishNegSyncMetrics(string(s.NegSyncerKey.NegType), string(s.endpointsCalculator.Mode()), err, start) if s.needInit { if err := s.ensureNetworkEndpointGroups(); err != nil { @@ -320,6 +325,7 @@ func (s *transactionSyncer) detachNetworkEndpoints(zone string, networkEndpointM // If error occurs or any transaction entry requires reconciliation, it will trigger resync func (s *transactionSyncer) operationInternal(operation transactionOp, zone string, networkEndpointMap map[negtypes.NetworkEndpoint]*composite.NetworkEndpoint) { var err error + start := time.Now() networkEndpoints := []*composite.NetworkEndpoint{} for _, ne := range networkEndpointMap { networkEndpoints = append(networkEndpoints, ne) @@ -340,6 +346,7 @@ func (s *transactionSyncer) operationInternal(operation transactionOp, zone stri // WARNING: commitTransaction must be called at last for analyzing the operation result s.commitTransaction(err, networkEndpointMap) + metrics.PublishNegOperationMetrics(operation.String(), string(s.NegSyncerKey.NegType), string(s.NegSyncerKey.GetAPIVersion()), err, len(networkEndpointMap), start) } func (s *transactionSyncer) recordEvent(eventType, reason, eventDesc string) { @@ -484,7 +491,9 @@ func (s *transactionSyncer) updateInitStatus(negObjRefs []negv1beta1.NegObjectRe neg.Status.NetworkEndpointGroups = negObjRefs } - ensureCondition(neg, getInitializedCondition(utilerrors.NewAggregate(errList))) + initializedCondition := getInitializedCondition(utilerrors.NewAggregate(errList)) + finalCondition := ensureCondition(neg, initializedCondition) + metrics.PublishNegInitializationMetrics(finalCondition.LastTransitionTime.Sub(origNeg.GetCreationTimestamp().Time)) _, err = patchNegStatus(s.svcNegClient, origNeg.Status, neg.Status, s.Namespace, s.NegSyncerKey.NegName) if err != nil { @@ -546,11 +555,11 @@ func patchNegStatus(svcNegClient svcnegclient.Interface, oldStatus, newStatus ne } // ensureCondition will update the condition on the neg object if necessary -func ensureCondition(neg *negv1beta1.ServiceNetworkEndpointGroup, expectedCondition negv1beta1.Condition) { +func ensureCondition(neg *negv1beta1.ServiceNetworkEndpointGroup, expectedCondition negv1beta1.Condition) negv1beta1.Condition { condition, index, exists := findCondition(neg.Status.Conditions, expectedCondition.Type) if !exists { neg.Status.Conditions = append(neg.Status.Conditions, expectedCondition) - return + return negv1beta1.Condition{} } if condition.Status == expectedCondition.Status { @@ -558,6 +567,7 @@ func ensureCondition(neg *negv1beta1.ServiceNetworkEndpointGroup, expectedCondit } neg.Status.Conditions[index] = expectedCondition + return expectedCondition } // getSyncedCondition returns the expected synced condition based on given error