Skip to content

Commit

Permalink
include MetricsUnavailable condition to Complete in Trial
Browse files Browse the repository at this point in the history
It is not easy for users to find why Trial failed when training code output incorrect format logs
since the trial-controller sets Succeeded condition with False to Trial if there are unavailable metrics in Katib DB as described in #1343.
So we also include MetricsUnavailable condition to Complete in Trial.
  • Loading branch information
tenzen-y committed May 28, 2022
1 parent a9d92bd commit 657b316
Show file tree
Hide file tree
Showing 20 changed files with 388 additions and 245 deletions.
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ ifndef HAS_SETUP_ENVTEST
endif
echo "setup-envtest has already installed"


check: generate fmt vet lint

fmt:
Expand Down
6 changes: 6 additions & 0 deletions pkg/apis/controller/experiments/v1beta1/experiment_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ type ExperimentStatus struct {
// List of trial names which have been early stopped.
EarlyStoppedTrialList []string `json:"earlyStoppedTrialList,omitempty"`

// List of trial names which have been metrics unavailable
MetricsUnavailableTrialList []string `json:"metricsUnavailableTrialList,omitempty"`

// Trials is the total number of trials owned by the experiment.
Trials int32 `json:"trials,omitempty"`

Expand All @@ -120,6 +123,9 @@ type ExperimentStatus struct {

// How many trials are currently early stopped.
TrialsEarlyStopped int32 `json:"trialsEarlyStopped,omitempty"`

// How many trials are currently metrics unavailable.
TrialMetricsUnavailable int32 `json:"trialMetricsUnavailable,omitempty"`
}

// OptimalTrial is the metrics and assignments of the best trial.
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 7 additions & 6 deletions pkg/apis/controller/trials/v1beta1/trial_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,13 @@ type TrialCondition struct {
type TrialConditionType string

const (
TrialCreated TrialConditionType = "Created"
TrialRunning TrialConditionType = "Running"
TrialSucceeded TrialConditionType = "Succeeded"
TrialKilled TrialConditionType = "Killed"
TrialFailed TrialConditionType = "Failed"
TrialEarlyStopped TrialConditionType = "EarlyStopped"
TrialCreated TrialConditionType = "Created"
TrialRunning TrialConditionType = "Running"
TrialSucceeded TrialConditionType = "Succeeded"
TrialKilled TrialConditionType = "Killed"
TrialFailed TrialConditionType = "Failed"
TrialMetricsUnavailable TrialConditionType = "MetricsUnavailable"
TrialEarlyStopped TrialConditionType = "EarlyStopped"
)

// +genclient
Expand Down
16 changes: 10 additions & 6 deletions pkg/apis/controller/trials/v1beta1/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,11 @@ func (trial *Trial) IsKilled() bool {

// IsMetricsUnavailable returns true if Trial metrics are not available
func (trial *Trial) IsMetricsUnavailable() bool {
cond := getCondition(trial, TrialSucceeded)
if cond != nil && cond.Status == v1.ConditionFalse {
return true
}
return false
return hasCondition(trial, TrialMetricsUnavailable)
}

func (trial *Trial) IsCompleted() bool {
return trial.IsSucceeded() || trial.IsFailed() || trial.IsKilled() || trial.IsEarlyStopped()
return trial.IsSucceeded() || trial.IsFailed() || trial.IsKilled() || trial.IsEarlyStopped() || trial.IsMetricsUnavailable()
}

func (trial *Trial) IsEarlyStopped() bool {
Expand Down Expand Up @@ -158,3 +154,11 @@ func (trial *Trial) MarkTrialStatusKilled(reason, message string) {
}
trial.setCondition(TrialKilled, v1.ConditionTrue, reason, message)
}

func (trial *Trial) MarkTrialStatusMetricsUnavailable(reason, message string) {
currentCond := getCondition(trial, TrialRunning)
if currentCond != nil {
trial.setCondition(TrialRunning, v1.ConditionFalse, currentCond.Reason, currentCond.Message)
}
trial.setCondition(TrialMetricsUnavailable, v1.ConditionTrue, reason, message)
}
269 changes: 137 additions & 132 deletions pkg/apis/manager/v1beta1/api.pb.go

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions pkg/apis/manager/v1beta1/api.proto
Original file line number Diff line number Diff line change
Expand Up @@ -226,16 +226,17 @@ message ParameterAssignment {
* Current Trial status. It contains Trial's latest condition, start time, completion time, observation.
*/
message TrialStatus {
// Trial can be in one of 6 conditions.
// Trial can be in one of 8 conditions.
// TODO (andreyvelich): Remove unused conditions.
enum TrialConditionType {
CREATED = 0;
RUNNING = 1;
SUCCEEDED = 2;
KILLED = 3;
FAILED = 4;
EARLYSTOPPED = 5;
UNKNOWN = 6;
METRICSUNAVAILABLE = 5;
EARLYSTOPPED = 6;
UNKNOWN = 7;
}
string start_time = 1; // Trial start time in RFC3339 format
string completion_time = 2; // Trial completion time in RFC3339 format
Expand Down
7 changes: 4 additions & 3 deletions pkg/apis/manager/v1beta1/gen-doc/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ Types of value for HyperParameter.
<a name="api-v1-beta1-TrialStatus-TrialConditionType"></a>

### TrialStatus.TrialConditionType
Trial can be in one of 6 conditions.
Trial can be in one of 8 conditions.
TODO (andreyvelich): Remove unused conditions.

| Name | Number | Description |
Expand All @@ -781,8 +781,9 @@ TODO (andreyvelich): Remove unused conditions.
| SUCCEEDED | 2 | |
| KILLED | 3 | |
| FAILED | 4 | |
| EARLYSTOPPED | 5 | |
| UNKNOWN | 6 | |
| METRICSUNAVAILABLE | 5 | |
| EARLYSTOPPED | 6 | |
| UNKNOWN | 7 | |



Expand Down
12 changes: 9 additions & 3 deletions pkg/apis/manager/v1beta1/gen-doc/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1777,7 +1777,7 @@ <h3 id="api.v1.beta1.ParameterType">ParameterType</h3>
</table>

<h3 id="api.v1.beta1.TrialStatus.TrialConditionType">TrialStatus.TrialConditionType</h3>
<p>Trial can be in one of 6 conditions.</p><p>TODO (andreyvelich): Remove unused conditions.</p>
<p>Trial can be in one of 8 conditions.</p><p>TODO (andreyvelich): Remove unused conditions.</p>
<table class="enum-table">
<thead>
<tr><td>Name</td><td>Number</td><td>Description</td></tr>
Expand Down Expand Up @@ -1815,17 +1815,23 @@ <h3 id="api.v1.beta1.TrialStatus.TrialConditionType">TrialStatus.TrialConditionT
</tr>

<tr>
<td>EARLYSTOPPED</td>
<td>METRICSUNAVAILABLE</td>
<td>5</td>
<td><p></p></td>
</tr>

<tr>
<td>UNKNOWN</td>
<td>EARLYSTOPPED</td>
<td>6</td>
<td><p></p></td>
</tr>

<tr>
<td>UNKNOWN</td>
<td>7</td>
<td><p></p></td>
</tr>

</tbody>
</table>

Expand Down
124 changes: 64 additions & 60 deletions pkg/apis/manager/v1beta1/python/api_pb2.py

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions pkg/apis/v1beta1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pkg/apis/v1beta1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,14 @@
"description": "Represents last time when the Experiment was reconciled. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC.",
"$ref": "#/definitions/v1.Time"
},
"metricsUnavailableTrialList": {
"description": "List of trial names which have been metrics unavailable",
"type": "array",
"items": {
"type": "string",
"default": ""
}
},
"pendingTrialList": {
"description": "List of trial names which are pending.",
"type": "array",
Expand Down Expand Up @@ -718,6 +726,11 @@
"default": ""
}
},
"trialMetricsUnavailable": {
"description": "How many trials are currently metrics unavailable.",
"type": "integer",
"format": "int32"
},
"trials": {
"description": "Trials is the total number of trials owned by the experiment.",
"type": "integer",
Expand Down
22 changes: 17 additions & 5 deletions pkg/controller.v1beta1/experiment/util/status_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,13 @@ func updateTrialsSummary(instance *experimentsv1beta1.Experiment, trials *trials
var bestTrialValue float64
sts := &instance.Status
sts.Trials = 0
sts.RunningTrialList, sts.PendingTrialList, sts.FailedTrialList, sts.SucceededTrialList, sts.KilledTrialList, sts.EarlyStoppedTrialList = nil, nil, nil, nil, nil, nil
sts.RunningTrialList = nil
sts.PendingTrialList = nil
sts.FailedTrialList = nil
sts.SucceededTrialList = nil
sts.KilledTrialList = nil
sts.EarlyStoppedTrialList = nil
sts.MetricsUnavailableTrialList = nil
bestTrialIndex := -1
isObjectiveGoalReached := false
var objectiveValueGoal float64
Expand All @@ -79,6 +85,8 @@ func updateTrialsSummary(instance *experimentsv1beta1.Experiment, trials *trials
sts.EarlyStoppedTrialList = append(sts.EarlyStoppedTrialList, trial.Name)
} else if trial.IsRunning() {
sts.RunningTrialList = append(sts.RunningTrialList, trial.Name)
} else if trial.IsMetricsUnavailable() {
sts.MetricsUnavailableTrialList = append(sts.MetricsUnavailableTrialList, trial.Name)
} else {
sts.PendingTrialList = append(sts.PendingTrialList, trial.Name)
}
Expand All @@ -95,7 +103,7 @@ func updateTrialsSummary(instance *experimentsv1beta1.Experiment, trials *trials
continue
}

//initialize vars to objective metric value of the first trial
// initialize vars to objective metric value of the first trial
if bestTrialIndex == -1 {
bestTrialValue = objectiveMetricValue
bestTrialIndex = index
Expand Down Expand Up @@ -126,6 +134,7 @@ func updateTrialsSummary(instance *experimentsv1beta1.Experiment, trials *trials
sts.TrialsFailed = int32(len(sts.FailedTrialList))
sts.TrialsKilled = int32(len(sts.KilledTrialList))
sts.TrialsEarlyStopped = int32(len(sts.EarlyStoppedTrialList))
sts.TrialMetricsUnavailable = int32(len(sts.MetricsUnavailableTrialList))

// if best trial is set
if bestTrialIndex != -1 {
Expand Down Expand Up @@ -177,8 +186,9 @@ func getObjectiveMetricValue(trial trialsv1beta1.Trial) string {
// UpdateExperimentStatusCondition updates the experiment status.
func UpdateExperimentStatusCondition(collector *ExperimentsCollector, instance *experimentsv1beta1.Experiment, isObjectiveGoalReached bool, getSuggestionDone bool) {
logger := log.WithValues("Experiment", types.NamespacedName{Name: instance.GetName(), Namespace: instance.GetNamespace()})
completedTrialsCount := instance.Status.TrialsSucceeded + instance.Status.TrialsFailed + instance.Status.TrialsKilled + instance.Status.TrialsEarlyStopped
failedTrialsCount := instance.Status.TrialsFailed
completedTrialsCount :=
instance.Status.TrialsSucceeded + instance.Status.TrialsFailed + instance.Status.TrialsKilled + instance.Status.TrialsEarlyStopped + instance.Status.TrialMetricsUnavailable
failedTrialsCount := instance.Status.TrialsFailed + instance.Status.TrialMetricsUnavailable
activeTrialsCount := instance.Status.TrialsPending + instance.Status.TrialsRunning
now := metav1.Now()

Expand All @@ -192,7 +202,9 @@ func UpdateExperimentStatusCondition(collector *ExperimentsCollector, instance *
}

// First check if MaxFailedTrialCount is reached.
if (instance.Spec.MaxFailedTrialCount != nil) && (failedTrialsCount > *instance.Spec.MaxFailedTrialCount) {
if (instance.Spec.MaxFailedTrialCount != nil) &&
((*instance.Spec.MaxFailedTrialCount != *instance.Spec.MaxTrialCount && failedTrialsCount > *instance.Spec.MaxFailedTrialCount) ||
(*instance.Spec.MaxFailedTrialCount == *instance.Spec.MaxTrialCount && failedTrialsCount == *instance.Spec.MaxFailedTrialCount)) {
msg := "Experiment has failed because max failed count has reached"
instance.MarkExperimentStatusFailed(ExperimentFailedReason, msg)
instance.Status.CompletionTime = &now
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -848,8 +848,8 @@ func newFakeTrials() []trialsv1beta1.Trial {
Status: trialsv1beta1.TrialStatus{
Conditions: []trialsv1beta1.TrialCondition{
{
Type: trialsv1beta1.TrialSucceeded,
Status: corev1.ConditionFalse,
Type: trialsv1beta1.TrialMetricsUnavailable,
Status: corev1.ConditionTrue,
Message: "Metrics are not available",
},
},
Expand Down
3 changes: 1 addition & 2 deletions pkg/controller.v1beta1/trial/trial_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,8 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error {

// Job already exists.
// If Trial is EarlyStopped we need to verify/update observation logs.
// TODO (andreyvelich): We can include "MetricsUnavailable" condition to "Complete".
// In that case, Trial's job will be deleted even if metrics are not available.
if deployedJob != nil && ((!instance.IsCompleted() && !instance.IsMetricsUnavailable()) || instance.IsEarlyStopped()) {
if deployedJob != nil && (!instance.IsCompleted() || instance.IsEarlyStopped()) {
jobStatus, err := trialutil.GetDeployedJobStatus(instance, deployedJob)
if err != nil {
logger.Error(err, "GetDeployedJobStatus error")
Expand Down
19 changes: 7 additions & 12 deletions pkg/controller.v1beta1/trial/trial_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ limitations under the License.
package trial

import (
"fmt"
"sync"
"testing"
"time"
Expand All @@ -41,7 +40,7 @@ import (
api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
trialutil "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/util"
util "github.com/kubeflow/katib/pkg/controller.v1beta1/util"
"github.com/kubeflow/katib/pkg/controller.v1beta1/util"
managerclientmock "github.com/kubeflow/katib/pkg/mock/v1beta1/trial/managerclient"
)

Expand Down Expand Up @@ -267,8 +266,8 @@ func TestReconcileBatchJob(t *testing.T) {
}
return trial.IsSucceeded() &&
len(trial.Status.Observation.Metrics) > 0 &&
trial.Status.Observation.Metrics[0].Max == "0.99" &&
trial.Status.Observation.Metrics[0].Min == "0.11" &&
trial.Status.Observation.Metrics[0].Max == "0.99" &&
trial.Status.Observation.Metrics[0].Latest == "0.11"
}, timeout).Should(gomega.BeTrue())

Expand All @@ -291,15 +290,11 @@ func TestReconcileBatchJob(t *testing.T) {
if err = c.Get(ctx, trialKey, trial); err != nil {
return false
}
isConditionCorrect := false
for _, cond := range trial.Status.Conditions {
if cond.Type == trialsv1beta1.TrialSucceeded && cond.Status == corev1.ConditionFalse &&
cond.Reason == fmt.Sprintf("%v. Job reason: %v", TrialMetricsUnavailableReason, batchJobCompleteReason) &&
cond.Message == fmt.Sprintf("Metrics are not available. Job message: %v", batchJobCompleteMessage) {
isConditionCorrect = true
}
}
return isConditionCorrect
return trial.IsMetricsUnavailable() &&
len(trial.Status.Observation.Metrics) > 0 &&
trial.Status.Observation.Metrics[0].Min == consts.UnavailableMetricValue &&
trial.Status.Observation.Metrics[0].Max == consts.UnavailableMetricValue &&
trial.Status.Observation.Metrics[0].Latest == consts.UnavailableMetricValue
}, timeout).Should(gomega.BeTrue())

// Delete the Trial
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller.v1beta1/trial/trial_controller_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,6 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria
r.collector.IncreaseTrialsSucceededCount(instance.Namespace)
}
} else if !instance.IsMetricsUnavailable() {
// TODO (andreyvelich): Is it correct to mark succeeded status false when metrics are unavailable?
// Ref issue to add new condition: https://github.com/kubeflow/katib/issues/1343
msg := "Metrics are not available"
reason := TrialMetricsUnavailableReason

Expand All @@ -81,10 +79,12 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria
}

logger.Info("Trial status changed to Metrics Unavailable")
instance.MarkTrialStatusSucceeded(corev1.ConditionFalse, reason, msg)
instance.MarkTrialStatusMetricsUnavailable(reason, msg)
instance.Status.CompletionTime = &timeNow

eventMsg := fmt.Sprintf("Metrics are not available for Job %v", deployedJobName)
r.recorder.Eventf(instance, corev1.EventTypeWarning, JobMetricsUnavailableReason, eventMsg)
r.collector.IncreaseTrialsMetricsUnavailableCount(instance.Namespace)
}
} else if jobStatus.Condition == trialutil.JobFailed && !instance.IsFailed() && !instance.IsEarlyStopped() {
msg := "Trial has failed"
Expand Down
Loading

0 comments on commit 657b316

Please sign in to comment.