Skip to content

Commit

Permalink
Merge pull request opendatahub-io#125 from zdtsw-forking/rhos-2.4-fix…
Browse files Browse the repository at this point in the history
…-upgrade

fix: upgrade from v1 to v2
  • Loading branch information
etirelli authored Nov 20, 2023
2 parents c8a6d91 + 997acea commit acbd9e1
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,11 @@ func (r *DataScienceClusterReconciler) Reconcile(ctx context.Context, req ctrl.R
// Owned objects are automatically garbage collected. For additional cleanup logic use operatorUninstall function.
// Return and don't requeue
if upgrade.HasDeleteConfigMap(r.Client) {
return reconcile.Result{}, fmt.Errorf("error while operator uninstall: %v",
upgrade.OperatorUninstall(r.Client, r.RestConfig))
if uninstallErr := upgrade.OperatorUninstall(r.Client, r.RestConfig); uninstallErr != nil {
return ctrl.Result{}, fmt.Errorf("error while operator uninstall: %v", uninstallErr)
}
}

return ctrl.Result{}, nil
}

Expand Down
2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ func main() {
}

// Apply update from legacy operator
if err = upgrade.UpdateFromLegacyVersion(setupClient, platform); err != nil {
if err = upgrade.UpdateFromLegacyVersion(setupClient, platform, dscApplicationsNamespace, dscMonitoringNamespace); err != nil {
setupLog.Error(err, "unable to update from legacy operator version")
}

Expand Down
184 changes: 173 additions & 11 deletions pkg/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ import (
"context"
"encoding/json"
"fmt"
"github.com/hashicorp/go-multierror"
"github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai"
appsv1 "k8s.io/api/apps/v1"
"k8s.io/apimachinery/pkg/util/wait"
"os"
// "reflect"
"strings"
"time"

Expand Down Expand Up @@ -257,28 +261,39 @@ func CreateDefaultDSCI(cli client.Client, platform deploy.Platform, appNamespace
return nil
}

func UpdateFromLegacyVersion(cli client.Client, platform deploy.Platform) error {
func UpdateFromLegacyVersion(cli client.Client, platform deploy.Platform, appNS string, montNamespace string) error {
// If platform is Managed, remove Kfdefs and create default dsc
if platform == deploy.ManagedRhods {
err := CreateDefaultDSC(cli, platform)
if err != nil {
fmt.Println("starting deletion of Deloyments in managed cluster")
if err := deleteResource(cli, appNS, "deployment"); err != nil {
return err
}
// this is for the modelmesh monitoring part from v1 to v2
if err := deleteResource(cli, montNamespace, "deployment"); err != nil {
return err
}
if err := deleteResource(cli, montNamespace, "statefulset"); err != nil {
return err
}
if err := CreateDefaultDSC(cli, platform); err != nil {
return err
}

err = RemoveKfDefInstances(cli, platform)
if err != nil {
if err := RemoveKfDefInstances(cli, platform); err != nil {
return err
}

return nil
}

if platform == deploy.SelfManagedRhods {
fmt.Println("starting deletion of Deloyments in selfmanaged cluster")
// If KfDef CRD is not found, we see it as a cluster not pre-installed v1 operator // Check if kfdef are deployed
kfdefCrd := &apiextv1.CustomResourceDefinition{}
err := cli.Get(context.TODO(), client.ObjectKey{Name: "kfdefs.kfdef.apps.kubeflow.org"}, kfdefCrd)
if err != nil {
if err := cli.Get(context.TODO(), client.ObjectKey{Name: "kfdefs.kfdef.apps.kubeflow.org"}, kfdefCrd); err != nil {
if apierrs.IsNotFound(err) {
// If no Crd found, return, since its a new Installation
// return empty list
return nil
} else {
return fmt.Errorf("error retrieving kfdef CRD : %v", err)
Expand All @@ -288,18 +303,27 @@ func UpdateFromLegacyVersion(cli client.Client, platform deploy.Platform) error
// If KfDef Instances found, and no DSC instances are found in Self-managed, that means this is an upgrade path from
// legacy version. Create a default DSC instance
kfDefList := &kfdefv1.KfDefList{}
err = cli.List(context.TODO(), kfDefList)
err := cli.List(context.TODO(), kfDefList)
if err != nil {
if apierrs.IsNotFound(err) {
// If no KfDefs, do nothing and return
return nil
} else {
return fmt.Errorf("error getting list of kfdefs: %v", err)
return fmt.Errorf("error getting kfdef instances: : %w", err)
}
}
if len(kfDefList.Items) > 0 {
err := CreateDefaultDSC(cli, platform)
if err != nil {
if err = deleteResource(cli, appNS, "deployment"); err != nil {
return fmt.Errorf("error deleting deployment: %w", err)
}
// this is for the modelmesh monitoring part from v1 to v2
if err := deleteResource(cli, montNamespace, "deployment"); err != nil {
return err
}
if err := deleteResource(cli, montNamespace, "statefulset"); err != nil {
return err
}
if err = CreateDefaultDSC(cli, platform); err != nil {
return err
}
}
Expand Down Expand Up @@ -409,3 +433,141 @@ func getClusterServiceVersion(cfg *rest.Config, watchNameSpace string) (*ofapi.C
}
return nil, nil
}

// deleteResource removes the legacy (v1) workloads of the given kind from namespace and
// waits until they are gone.
//
// In v2, Deployment selectors use a label "app.opendatahub.io/<componentName>" which is
// not present in v1. Since label selectors are immutable, the existing workloads have to
// be deleted and recreated. Because the upgrade cannot proceed while an old workload is
// still present, the deletion is retried with exponential backoff.
//
// resourceType must be "deployment" or "statefulset"; any other value is rejected with an
// error instead of being silently treated as a successful deletion.
// The returned error is whatever wait.ExponentialBackoffWithContext reports: the error
// from the check function, or its timeout error once the retry budget is used up.
func deleteResource(cli client.Client, namespace string, resourceType string) error {
	var (
		backoff wait.Backoff
		check   func(ctx context.Context, cli client.Client, namespace string) (bool, error)
	)

	switch resourceType {
	case "deployment":
		// At most 4 checks; the wait starts at 5s and doubles after each check.
		backoff = wait.Backoff{
			Duration: 5 * time.Second,
			Factor:   2.0,
			Jitter:   0.1,
			Steps:    4,
			Cap:      1 * time.Minute,
		}
		check = deleteDeploymentsAndCheck
	case "statefulset":
		// At most 2 checks; the wait starts at 10s.
		backoff = wait.Backoff{
			Duration: 10 * time.Second,
			Factor:   2.0,
			Jitter:   0.1,
			Steps:    2,
			Cap:      1 * time.Minute,
		}
		check = deleteStatefulsetsAndCheck
	default:
		return fmt.Errorf("unsupported resource type %q", resourceType)
	}

	return wait.ExponentialBackoffWithContext(context.TODO(), backoff, func(ctx context.Context) (bool, error) {
		return check(ctx, cli, namespace)
	})
}

// deleteDeploymentsAndCheck deletes every Deployment in namespace whose selector carries no
// "app.opendatahub.io/" label key, then checks whether those Deployments are gone.
//
// It has the shape of a poll condition:
//   - (false, nil): the list call failed or a deleted Deployment still exists; try again.
//   - (true, nil): nothing is left to delete.
//   - (true, err): a delete or lookup failed with an unexpected error.
//
// Deployments that already have the new selector label are left untouched, so the function
// is also safe to run on a v2 to v2 upgrade.
func deleteDeploymentsAndCheck(ctx context.Context, cli client.Client, namespace string) (bool, error) { //nolint
	var multiErr *multierror.Error
	deployments := &appsv1.DeploymentList{}
	listOpts := &client.ListOptions{
		Namespace: namespace,
	}

	// A failed List is treated as retryable: report "not done" without an error.
	if err := cli.List(ctx, deployments, listOpts); err != nil {
		return false, nil
	}

	// Only delete Deployments without the new selector label, so unrelated or already
	// migrated Deployments are not removed.
	markedForDeletion := []appsv1.Deployment{}
	for i := range deployments.Items {
		// Work on the list item in place instead of taking the address of the range variable.
		deployment := &deployments.Items[i]

		v2 := false
		// Guard against a missing selector so a malformed object cannot panic the upgrade.
		if deployment.Spec.Selector != nil {
			for label := range deployment.Spec.Selector.MatchLabels {
				if strings.Contains(label, "app.opendatahub.io/") {
					// this deployment has the new label, this is a v2 to v2 upgrade
					// there is no need to recreate it, as labels are matching
					v2 = true
					break
				}
			}
		}
		if v2 {
			continue
		}

		markedForDeletion = append(markedForDeletion, *deployment)
		// NotFound means the object is already gone, which is the desired outcome.
		if err := cli.Delete(ctx, deployment); err != nil && !apierrs.IsNotFound(err) {
			multiErr = multierror.Append(multiErr, err)
		}
	}

	for i := range markedForDeletion {
		key := client.ObjectKey{
			Namespace: namespace,
			Name:      markedForDeletion[i].Name,
		}
		err := cli.Get(ctx, key, &markedForDeletion[i])
		if err == nil {
			// resource still exists, wait for it to be deleted
			return false, nil
		}
		if !apierrs.IsNotFound(err) {
			// unexpected error, report it
			multiErr = multierror.Append(multiErr, err)
		}
		// NotFound: resource has been successfully deleted
	}

	return true, multiErr.ErrorOrNil()
}

// deleteStatefulsetsAndCheck deletes every StatefulSet in namespace whose selector carries no
// "app.opendatahub.io/" label key, then checks whether those StatefulSets are gone.
//
// It has the shape of a poll condition:
//   - (false, nil): the list call failed or a deleted StatefulSet still exists; try again.
//   - (true, nil): nothing is left to delete.
//   - (true, err): a delete or lookup failed with an unexpected error.
func deleteStatefulsetsAndCheck(ctx context.Context, cli client.Client, namespace string) (bool, error) { //nolint
	var multiErr *multierror.Error
	statefulsets := &appsv1.StatefulSetList{}
	listOpts := &client.ListOptions{
		Namespace: namespace,
	}

	// A failed List is treated as retryable: report "not done" without an error.
	if err := cli.List(ctx, statefulsets, listOpts); err != nil {
		return false, nil
	}

	// Ranging keeps this safe for any number of items, including none.
	markedForDeletion := []appsv1.StatefulSet{}
	for i := range statefulsets.Items {
		// Work on the list item in place instead of taking the address of the range variable.
		statefulset := &statefulsets.Items[i]

		v2 := false
		// Guard against a missing selector so a malformed object cannot panic the upgrade.
		if statefulset.Spec.Selector != nil {
			for label := range statefulset.Spec.Selector.MatchLabels {
				if strings.Contains(label, "app.opendatahub.io/") {
					// Already carries the new selector label; leave it alone.
					v2 = true
					break
				}
			}
		}
		if v2 {
			continue
		}

		markedForDeletion = append(markedForDeletion, *statefulset)
		// NotFound means the object is already gone, which is the desired outcome.
		if err := cli.Delete(ctx, statefulset); err != nil && !apierrs.IsNotFound(err) {
			multiErr = multierror.Append(multiErr, err)
		}
	}

	for i := range markedForDeletion {
		key := client.ObjectKey{
			Namespace: namespace,
			Name:      markedForDeletion[i].Name,
		}
		err := cli.Get(ctx, key, &markedForDeletion[i])
		if err == nil {
			// resource still exists, wait for it to be deleted
			return false, nil
		}
		if !apierrs.IsNotFound(err) {
			// unexpected error, report it
			multiErr = multierror.Append(multiErr, err)
		}
		// NotFound: resource has been successfully deleted
	}

	return true, multiErr.ErrorOrNil()
}

0 comments on commit acbd9e1

Please sign in to comment.