Skip to content

Commit

Permalink
feat: Add error checking to CheckResourceStatus (#583)
Browse files Browse the repository at this point in the history
**Reason for Change**:
1. Add error checking to CheckResourceStatus:

- For tuning job, when fail pods detected, change status and don't need
to wait till time limit.
- For inference job, when DeploymentProgressing is failed, change status
and don't need to wait till time limit.

2. Add unit test

**Requirements**

- [ ] added unit tests and e2e tests (if applicable).

**Issue Fixed**:
<!-- If this PR fixes GitHub issue 4321, add "Fixes #4321" to the next
line. -->

**Notes for Reviewers**:

---------

Signed-off-by: Bangqi Zhu <bangqizhu@microsoft.com>
Co-authored-by: Bangqi Zhu <bangqizhu@microsoft.com>
  • Loading branch information
bangqipropel and Bangqi Zhu authored Sep 3, 2024
1 parent f083822 commit cb7b83f
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 8 deletions.
14 changes: 13 additions & 1 deletion pkg/resources/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,14 @@ func CheckResourceStatus(obj client.Object, kubeClient client.Client, timeoutDur

switch k8sResource := obj.(type) {
case *appsv1.Deployment:
for _, condition := range k8sResource.Status.Conditions {
if condition.Type == appsv1.DeploymentProgressing && condition.Status == corev1.ConditionFalse {
errorMessage := fmt.Sprintf("deployment %s is not progressing: %s", k8sResource.Name, condition.Message)
klog.ErrorS(fmt.Errorf(errorMessage), "deployment", k8sResource.Name, "reason", condition.Reason, "message", condition.Message)
return fmt.Errorf(errorMessage)
}
}

if k8sResource.Status.ReadyReplicas == *k8sResource.Spec.Replicas {
klog.InfoS("deployment status is ready", "deployment", k8sResource.Name)
return nil
Expand All @@ -80,7 +88,11 @@ func CheckResourceStatus(obj client.Object, kubeClient client.Client, timeoutDur
return nil
}
case *batchv1.Job:
if k8sResource.Status.Active > 0 || k8sResource.Status.Succeeded > 0 {
if k8sResource.Status.Failed > 0 {
klog.ErrorS(fmt.Errorf("job failed"), "name", k8sResource.Name, "failed count", k8sResource.Status.Failed)
return fmt.Errorf("job %s has failed %d pods", k8sResource.Name, k8sResource.Status.Failed)
}
if k8sResource.Status.Succeeded > 0 || (k8sResource.Status.Ready != nil && *k8sResource.Status.Ready > 0) {
klog.InfoS("job status is active/succeeded", "name", k8sResource.Name)
return nil
}
Expand Down
86 changes: 79 additions & 7 deletions pkg/resources/resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ package resources
import (
"context"
"errors"
"github.com/azure/kaito/pkg/utils/test"
"testing"
"time"

"github.com/azure/kaito/pkg/utils/test"

appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"

Expand All @@ -27,7 +28,8 @@ func int32Ptr(i int32) *int32 {

func TestCheckResourceStatus(t *testing.T) {
scheme := runtime.NewScheme()
_ = v1.AddToScheme(scheme)
_ = appsv1.AddToScheme(scheme)
_ = batchv1.AddToScheme(scheme)
t.Run("Should return nil for ready Deployment", func(t *testing.T) {
// Create a deployment object for testing
dep := &appsv1.Deployment{
Expand Down Expand Up @@ -114,6 +116,76 @@ func TestCheckResourceStatus(t *testing.T) {
assert.Error(t, err)
assert.Equal(t, "unsupported resource type", err.Error())
})

t.Run("Should return error when DeploymentProcessing is False", func(t *testing.T) {
dep := &appsv1.Deployment{
Status: appsv1.DeploymentStatus{
ReadyReplicas: 3,
Conditions: []appsv1.DeploymentCondition{
{
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionFalse,
Reason: "ProcessDeadlineExceeded",
Message: "Deployment exceeded its progress deadline",
},
},
},
Spec: appsv1.DeploymentSpec{
Replicas: int32Ptr(3),
},
}
cl := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(dep).Build()
err := CheckResourceStatus(dep, cl, 2*time.Second)
assert.Error(t, err)
assert.Contains(t, err.Error(), "Deployment exceeded its progress deadline")
})

t.Run("Should return error for Job with failed pods", func(t *testing.T) {
job := &batchv1.Job{
Status: batchv1.JobStatus{
Failed: 1,
},
}
cl := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(job).Build()
err := CheckResourceStatus(job, cl, 2*time.Second)
assert.Error(t, err)
assert.Contains(t, err.Error(), "has failed 1 pods")
})

t.Run("Should return deadline exceeded for Job with only active pods", func(t *testing.T) {
job := &batchv1.Job{
Status: batchv1.JobStatus{
Active: 1,
},
}
cl := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(job).Build()
err := CheckResourceStatus(job, cl, 2*time.Second)
assert.Error(t, err)
assert.Equal(t, err, context.DeadlineExceeded)
})

t.Run("Should return nil for Job with only succeeded pods", func(t *testing.T) {
job := &batchv1.Job{
Status: batchv1.JobStatus{
Succeeded: 1,
},
}
cl := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(job).Build()
err := CheckResourceStatus(job, cl, 2*time.Second)
assert.Nil(t, err)
})

t.Run("Should return nil for Job with only ready pods", func(t *testing.T) {
readyCount := int32(1)
job := &batchv1.Job{
Status: batchv1.JobStatus{
Ready: &readyCount,
},
}
cl := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(job).Build()
err := CheckResourceStatus(job, cl, 2*time.Second)
assert.Nil(t, err)
})
}

func TestCreateResource(t *testing.T) {
Expand All @@ -124,16 +196,16 @@ func TestCreateResource(t *testing.T) {
}{
"Resource creation fails with Deployment object": {
callMocks: func(c *test.MockClient) {
c.On("Create", mock.IsType(context.Background()), mock.IsType(&v1.Deployment{}), mock.Anything).Return(errors.New("Failed to create resource"))
c.On("Create", mock.IsType(context.Background()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(errors.New("Failed to create resource"))
},
expectedResource: &v1.Deployment{},
expectedResource: &appsv1.Deployment{},
expectedError: errors.New("Failed to create resource"),
},
"Resource creation succeeds with Statefulset object": {
callMocks: func(c *test.MockClient) {
c.On("Create", mock.IsType(context.Background()), mock.IsType(&v1.StatefulSet{}), mock.Anything).Return(nil)
c.On("Create", mock.IsType(context.Background()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil)
},
expectedResource: &v1.StatefulSet{},
expectedResource: &appsv1.StatefulSet{},
expectedError: nil,
},
"Resource creation succeeds with Service object": {
Expand Down

0 comments on commit cb7b83f

Please sign in to comment.