Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter out unhealthy workloads #695

Merged
merged 1 commit into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 31 additions & 16 deletions pkg/controller/workload/workload_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,18 +144,27 @@
return nil
}

func (p *Processor) removeWorkloadResource(removedResources []string) error {
func (p *Processor) removeWorkloadResources(removedResources []string) error {
for _, uid := range removedResources {
wl := p.WorkloadCache.GetWorkloadByUid(uid)
p.WorkloadCache.DeleteWorkload(uid)
telemetry.DeleteWorkloadMetric(wl)
if err := p.removeWorkloadFromBpfMap(uid); err != nil {
return err
err := p.removeWorkload(uid)
if err != nil {
log.Warnf("removeWorkload %s failed: %v", uid, err)
continue

Check warning on line 152 in pkg/controller/workload/workload_processor.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/workload/workload_processor.go#L149-L152

Added lines #L149 - L152 were not covered by tests
}
}
return nil
}

func (p *Processor) removeWorkload(uid string) error {
wl := p.WorkloadCache.GetWorkloadByUid(uid)
if wl == nil {
return nil

Check warning on line 161 in pkg/controller/workload/workload_processor.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/workload/workload_processor.go#L158-L161

Added lines #L158 - L161 were not covered by tests
}
p.WorkloadCache.DeleteWorkload(uid)
telemetry.DeleteWorkloadMetric(wl)
return p.removeWorkloadFromBpfMap(uid)

Check warning on line 165 in pkg/controller/workload/workload_processor.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/workload/workload_processor.go#L163-L165

Added lines #L163 - L165 were not covered by tests
}

func (p *Processor) removeWorkloadFromBpfMap(uid string) error {
var (
err error
Expand Down Expand Up @@ -220,7 +229,7 @@
return nil
}

func (p *Processor) removeServiceResource(resources []string) error {
func (p *Processor) removeServiceResources(resources []string) error {
for _, name := range resources {
telemetry.DeleteServiceMetric(name)
svc := p.ServiceCache.GetService(name)
Expand Down Expand Up @@ -368,10 +377,18 @@
}

func (p *Processor) handleWorkload(workload *workloadapi.Workload) error {
log.Debugf("handle workload: %s", workload.Uid)
log.Debugf("handle workload: %s", workload.ResourceName())

// Keep track of the workload no matter it is healthy, unhealthy workload is just for debugging
p.WorkloadCache.AddOrUpdateWorkload(workload)

// Exclude unhealthy workload, which is not ready to serve traffic
if workload.Status == workloadapi.WorkloadStatus_UNHEALTHY {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not correct, say if the workload turns to unhealthy from healthy. It will still exist in the bpf map

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fix depends on some thing in #783, so will continue after that merged

log.Debugf("workload %s is unhealthy", workload.ResourceName())
// If the workload is updated to unhealthy, we should remove it from the bpf map
return p.removeWorkloadFromBpfMap(workload.Uid)
}

unboundedEndpointKeys, newServices := p.compareWorkloadServices(workload)
if err := p.handleWorkloadUnboundServices(workload, unboundedEndpointKeys); err != nil {
log.Errorf("handleWorkloadUnboundServices %s failed: %v", workload.ResourceName(), err)
Expand Down Expand Up @@ -527,10 +544,10 @@
}
}

if err := p.removeWorkloadResource(workloadNames); err != nil {
log.Errorf("RemoveWorkloadResource failed: %v", err)
if err := p.removeWorkloadResources(workloadNames); err != nil {
log.Errorf("removeWorkloadResources failed: %v", err)

Check warning on line 548 in pkg/controller/workload/workload_processor.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/workload/workload_processor.go#L548

Added line #L548 was not covered by tests
}
if err := p.removeServiceResource(serviceNames); err != nil {
if err := p.removeServiceResources(serviceNames); err != nil {
log.Errorf("RemoveServiceResource failed: %v", err)
}
}
Expand All @@ -552,21 +569,19 @@
case *workloadapi.Address_Service:
services = append(services, address.GetService())
default:
log.Errorf("unknown type")
log.Errorf("unknown type, should not reach here")

Check warning on line 572 in pkg/controller/workload/workload_processor.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/workload/workload_processor.go#L572

Added line #L572 was not covered by tests
}
}

for _, service := range services {
log.Debugf("handle service %v", service.ResourceName())
if err = p.handleService(service); err != nil {
log.Errorf("handle service failed, err: %v", err)
log.Errorf("handle service %v failed, err: %v", service.ResourceName(), err)

Check warning on line 578 in pkg/controller/workload/workload_processor.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/workload/workload_processor.go#L578

Added line #L578 was not covered by tests
}
}

for _, workload := range workloads {
log.Debugf("handle workload %v", workload.ResourceName())
if err = p.handleWorkload(workload); err != nil {
log.Errorf("handle workload failed, err: %v", err)
log.Errorf("handle workload %s failed, err: %v", workload.ResourceName(), err)

Check warning on line 584 in pkg/controller/workload/workload_processor.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/workload/workload_processor.go#L584

Added line #L584 was not covered by tests
}
}

Expand Down
37 changes: 29 additions & 8 deletions pkg/controller/workload/workload_processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ func Test_handleWorkload(t *testing.T) {
_ = p.handleService(fakeSvc)

// 2. add workload
wl := createTestWorkloadWithService(true)
err := p.handleWorkload(wl)
workload1 := createTestWorkloadWithService(true)
err := p.handleWorkload(workload1)
assert.NoError(t, err)

workloadID := checkFrontEndMap(t, wl.Addresses[0], p)
checkBackendMap(t, p, workloadID, wl)
workloadID := checkFrontEndMap(t, workload1.Addresses[0], p)
checkBackendMap(t, p, workloadID, workload1)

// 2.1 check front end map contains service
svcID := checkFrontEndMap(t, fakeSvc.Addresses[0].Address, p)
Expand Down Expand Up @@ -118,9 +118,9 @@ func Test_handleWorkload(t *testing.T) {
checkBackendMap(t, p, workload2ID, workload2)

// 5 update workload to remove the bound services
wl3 := proto.Clone(wl).(*workloadapi.Workload)
wl3.Services = nil
err = p.handleWorkload(wl3)
workload1Updated := proto.Clone(workload1).(*workloadapi.Workload)
workload1Updated.Services = nil
err = p.handleWorkload(workload1Updated)
assert.NoError(t, err)

// 5.1 check service map
Expand All @@ -142,7 +142,28 @@ func Test_handleWorkload(t *testing.T) {
// 6.2 check service map contains service, but no waypoint address
checkServiceMap(t, p, svcID, wpSvc, 0)

// 7. delete service
// 7. test add unhealthy workload
workload3 := createFakeWorkload("1.2.3.7", workloadapi.NetworkMode_STANDARD)
workload3.Status = workloadapi.WorkloadStatus_UNHEALTHY
_ = p.handleWorkload(workload3)

addr, _ := netip.AddrFromSlice(workload3.Addresses[0])
networkAddress := cache.NetworkAddress{
Network: workload3.Network,
Address: addr,
}
got := p.WorkloadCache.GetWorkloadByAddr(networkAddress)
t.Logf("workload3: %v", got)
assert.NotNil(t, got)
assert.Equal(t, got.Status, workloadapi.WorkloadStatus_UNHEALTHY)
checkNotExistInFrontEndMap(t, workload3.Addresses[0], p)

// 8. update workload from healthy to unhealthy, should remove it from bpf map
workload2.Status = workloadapi.WorkloadStatus_UNHEALTHY
_ = p.handleWorkload(workload2)
checkNotExistInFrontEndMap(t, workload2.Addresses[0], p)

// 9. delete service
p.handleRemovedAddresses([]string{fakeSvc.ResourceName()})
checkNotExistInFrontEndMap(t, fakeSvc.Addresses[0].Address, p)

Expand Down
Loading