Skip to content

Commit

Permalink
UPSTREAM: 99095: Prevent Kubelet stuck in DiskPressure when imagefs
Browse files Browse the repository at this point in the history
  • Loading branch information
rphillips authored and soltysh committed Sep 8, 2021
1 parent 828499b commit a85e535
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 2 deletions.
5 changes: 3 additions & 2 deletions pkg/kubelet/eviction/eviction_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -442,8 +442,9 @@ func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Sign
observations, _ := makeSignalObservations(summary)
debugLogObservations("observations after resource reclaim", observations)

// determine the set of thresholds met independent of grace period
thresholds := thresholdsMet(m.config.Thresholds, observations, false)
// evaluate all thresholds independently of their grace period to see if with
// the new observations, we think we have met min reclaim goals
thresholds := thresholdsMet(m.config.Thresholds, observations, true)
debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)

if len(thresholds) == 0 {
Expand Down
45 changes: 45 additions & 0 deletions pkg/kubelet/eviction/eviction_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,51 @@ func TestNodeReclaimFuncs(t *testing.T) {
t.Errorf("Manager should not report disk pressure")
}

// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)

// we should not have disk pressure
if manager.IsUnderDiskPressure() {
t.Errorf("Manager should not report disk pressure")
}

// induce hard threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats)
// make GC return disk usage below the threshold, but not satisfying minReclaim
diskGC.summaryAfterGC = summaryStatsMaker("1.1Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)

// we should have disk pressure
if !manager.IsUnderDiskPressure() {
t.Errorf("Manager should report disk pressure since soft threshold was met")
}

// verify image gc was invoked
if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
t.Errorf("Manager should have invoked image gc")
}

// verify a pod was killed because image gc was not enough to satisfy minReclaim
if podKiller.pod == nil {
t.Errorf("Manager should have killed a pod, but didn't")
}

// reset state
diskGC.imageGCInvoked = false
diskGC.containerGCInvoked = false
podKiller.pod = nil

// remove disk pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)

// we should not have disk pressure
if manager.IsUnderDiskPressure() {
t.Errorf("Manager should not report disk pressure")
}

// induce disk pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats)
Expand Down

0 comments on commit a85e535

Please sign in to comment.