diff --git a/.github/workflows/build-and-push.yaml b/.github/workflows/build-and-push.yaml index 0ff71d20..8c9de159 100644 --- a/.github/workflows/build-and-push.yaml +++ b/.github/workflows/build-and-push.yaml @@ -107,6 +107,8 @@ jobs: if: env.BUILD_CONTEXT == 'ci' run: | sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/base/params.env + sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/overlays/odh/params.env + sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/overlays/rhoai/params.env rm -Rf $(ls . | grep -v config) rm -Rf .gitignore .dockerignore .github .git .yamllint.yaml # push to ci-manifest repo @@ -146,4 +148,12 @@ jobs: 📦 [LMES job image](https://quay.io/trustyai/ta-lmes-job:${{ github.event.pull_request.head.sha }}): `quay.io/trustyai/ta-lmes-job:${{ github.event.pull_request.head.sha }}` 🗂️ [CI manifests](https://github.com/trustyai-explainability/trustyai-service-operator-ci/tree/operator-${{ env.TAG }}) + + ``` + devFlags: + manifests: + - contextDir: config + sourcePath: '' + uri: https://api.github.com/repos/trustyai-explainability/trustyai-service-operator-ci/tarball/operator-${{ env.TAG }} + ``` diff --git a/README.md b/README.md index aa49524e..14f1a38b 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are available | `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. | | `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. | +#### Database Status + +| Status Type | Status Reason | Description | +|---------------|-------------------------|---------------------------------------------------| +| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found | +| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) | +| `DBAvailable` | `DBConnectionError` | Service error connecting to the database | +| `DBAvailable` | `DBAvailable` | Successfully connected to the database | + #### Status Behavior - If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`. +- If in database mode, any `DBAvailable` reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready`. - However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, if it is `Ready` by all other conditions, it will remain so. ## Contributing diff --git a/cmd/main.go b/cmd/main.go index d5771309..dcb17fef 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -25,6 +25,7 @@ import ( kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" routev1 "github.com/openshift/api/route/v1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them.
@@ -58,6 +59,7 @@ func init() { utilruntime.Must(kservev1alpha1.AddToScheme(scheme)) utilruntime.Must(kservev1beta1.AddToScheme(scheme)) utilruntime.Must(routev1.AddToScheme(scheme)) + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } diff --git a/config/base/params.env b/config/base/params.env index f7ff45d0..e0de8034 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -9,3 +9,4 @@ lmes-image-pull-policy=Always lmes-max-batch-size=24 lmes-default-batch-size=8 lmes-detect-device=true + diff --git a/config/overlays/rhoai/kustomization.yaml b/config/overlays/rhoai/kustomization.yaml index 7ad939f1..27ce6516 100644 --- a/config/overlays/rhoai/kustomization.yaml +++ b/config/overlays/rhoai/kustomization.yaml @@ -10,4 +10,4 @@ patchesStrategicMerge: configMapGenerator: - env: params.env behavior: merge - name: config \ No newline at end of file + name: config diff --git a/config/rbac/auth_proxy_service.yaml b/config/rbac/auth_proxy_service.yaml index cce9af6e..85814501 100644 --- a/config/rbac/auth_proxy_service.yaml +++ b/config/rbac/auth_proxy_service.yaml @@ -16,6 +16,6 @@ spec: - name: https port: 8443 protocol: TCP - targetPort: 8080 + targetPort: 8081 selector: control-plane: controller-manager diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 5c2df527..38d54ff1 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -57,6 +57,14 @@ rules: - list - update - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch - apiGroups: - apps resources: @@ -99,6 +107,19 @@ rules: - create - list - watch +- apiGroups: + - networking.istio.io + resources: + - destinationrules + - virtualservices + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - rbac.authorization.k8s.io resources: diff --git a/controllers/tas/constants.go b/controllers/tas/constants.go index a2b32819..989900f6 100644 --- a/controllers/tas/constants.go +++ b/controllers/tas/constants.go @@ -47,6 +47,7 @@ const ( StatusTypePVCAvailable = "PVCAvailable" StatusTypeRouteAvailable = "RouteAvailable" StatusTypeAvailable = "Available" + StatusTypeDBAvailable = "DBAvailable" ) // Status reasons @@ -59,6 +60,10 @@ const ( StatusReasonRouteFound = "RouteFound" StatusAvailable = "AllComponentsReady" StatusNotAvailable = "NotAllComponentsReady" + StatusDBCredentialsNotFound = "DBCredentialsNotFound" + StatusDBCredentialsError = "DBCredentialsError" + StatusDBConnectionError = "DBConnectionError" + StatusDBAvailable = "DBAvailable" ) // Event reasons @@ -68,4 +73,14 @@ const ( EventReasonServiceMonitorCreated = "ServiceMonitorCreated" ) +const ( + StateReasonCrashLoopBackOff = "CrashLoopBackOff" +) + +// Phases +const ( + PhaseReady = "Ready" + PhaseNotReady = "Not Ready" +) + const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration" diff --git a/controllers/tas/database.go b/controllers/tas/database.go new file mode 100644 index 00000000..014b55d9 --- /dev/null +++ b/controllers/tas/database.go @@ -0,0 +1,62 @@ +package tas + +import ( + "context" + "strings" + + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance 
*trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) { + deployment := &appsv1.Deployment{} + err := r.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return false, err + } + + for _, cond := range deployment.Status.Conditions { + if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue { + podList := &corev1.PodList{} + listOpts := []client.ListOption{ + client.InNamespace(instance.Namespace), + client.MatchingLabels(deployment.Spec.Selector.MatchLabels), + } + if err := r.List(ctx, podList, listOpts...); err != nil { + return false, err + } + + for _, pod := range podList.Items { + for _, cs := range pod.Status.ContainerStatuses { + if cs.Name == "trustyai-service" { + if cs.State.Running != nil { + return true, nil + } + + if cs.LastTerminationState.Terminated != nil { + termination := cs.LastTerminationState.Terminated + if termination.Reason == "Error" && termination.Message != "" { + if strings.Contains(termination.Message, "Socket fail to connect to host:address") { + return false, nil + } + } + } + + if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff { + return false, nil + } + } + } + } + } + } + + return false, nil +} diff --git a/controllers/tas/deployment.go b/controllers/tas/deployment.go index 0dcc434e..4d95740e 100644 --- a/controllers/tas/deployment.go +++ b/controllers/tas/deployment.go @@ -5,15 +5,15 @@ import ( "reflect" "strconv" + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1" "github.com/trustyai-explainability/trustyai-service-operator/controllers/constants" templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/tas/templates" - - trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -74,13 +74,13 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, } if instance.Spec.Storage.IsStorageDatabase() { - _, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace) + _, err := r.getSecret(ctx, instance.Name+"-db-ca", instance.Namespace) if err != nil { deploymentConfig.UseDBTLSCerts = false - log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found") + log.FromContext(ctx).Info("Using insecure database connection. 
Certificates " + instance.Name + "-db-ca not found") } else { deploymentConfig.UseDBTLSCerts = true - log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls") + log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-ca") } } else { deploymentConfig.UseDBTLSCerts = false @@ -203,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan return nil } +// checkDeploymentReady verifies that a TrustyAI service deployment is ready func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) { deployment := &appsv1.Deployment{} @@ -217,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in for _, cond := range deployment.Status.Conditions { if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue { if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas { + podList := &corev1.PodList{} + listOpts := []client.ListOption{ + client.InNamespace(instance.Namespace), + client.MatchingLabels(deployment.Spec.Selector.MatchLabels), + } + if err := r.List(ctx, podList, listOpts...); err != nil { + return false, err + } + + for _, pod := range podList.Items { + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff { + return false, nil + } + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { + return false, nil + } + } + } + return true, nil } } diff --git a/controllers/tas/destination_rule.go b/controllers/tas/destination_rule.go new file mode 100644 index 00000000..caf32336 --- /dev/null +++ b/controllers/tas/destination_rule.go @@ -0,0 +1,89 @@ +package tas + +import ( + "context" + "fmt" + "reflect" + + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1" + templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/tas/templates" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +const ( + destinationRuleTemplatePath = "service/destination-rule.tmpl.yaml" + destinationRuleCDRName = "destinationrules.networking.istio.io" +) + +// DestinationRuleConfig has the variables for the DestinationRule template +type DestinationRuleConfig struct { + Name string + Namespace string + DestinationRuleName string +} + +// isDestinationRuleCRDPresent returns true if the DestinationRule CRD is present, false otherwise +func (r *TrustyAIServiceReconciler) isDestinationRuleCRDPresent(ctx context.Context) (bool, error) { + crd := &apiextensionsv1.CustomResourceDefinition{} + + err := r.Get(ctx, types.NamespacedName{Name: destinationRuleCDRName}, crd) + if err != nil { + if !errors.IsNotFound(err) { + return false, fmt.Errorf("error getting "+destinationRuleCDRName+" CRD: %v", err) + } + // Not found + return false, nil + } + + // Found + return true, nil +} + +func (r *TrustyAIServiceReconciler) ensureDestinationRule(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { + + destinationRuleName := instance.Name + "-internal" + + existingDestinationRule := &unstructured.Unstructured{} 
+ existingDestinationRule.SetKind("DestinationRule") + existingDestinationRule.SetAPIVersion("networking.istio.io/v1beta1") + + // Check if the DestinationRule already exists + err := r.Get(ctx, types.NamespacedName{Name: destinationRuleName, Namespace: instance.Namespace}, existingDestinationRule) + if err == nil { + // DestinationRule exists + return nil + } + + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to check for existing DestinationRule: %v", err) + } + + destinationRuleConfig := DestinationRuleConfig{ + Name: instance.Name, + Namespace: instance.Namespace, + DestinationRuleName: destinationRuleName, + } + + var destinationRule *unstructured.Unstructured + destinationRule, err = templateParser.ParseResource[unstructured.Unstructured](destinationRuleTemplatePath, destinationRuleConfig, reflect.TypeOf(&unstructured.Unstructured{})) + if err != nil { + log.FromContext(ctx).Error(err, "could not parse the DestinationRule template") + return err + } + + if err := ctrl.SetControllerReference(instance, destinationRule, r.Scheme); err != nil { + return err + } + + err = r.Create(ctx, destinationRule) + if err != nil { + return fmt.Errorf("failed to create DestinationRule: %v", err) + } + + return nil +} diff --git a/controllers/tas/inference_services.go b/controllers/tas/inference_services.go index 072f5af9..950f4fde 100644 --- a/controllers/tas/inference_services.go +++ b/controllers/tas/inference_services.go @@ -260,7 +260,7 @@ func (r *TrustyAIServiceReconciler) handleInferenceServices(ctx context.Context, // patchKServe adds a TrustyAI service as an InferenceLogger to a KServe InferenceService func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService, infService kservev1beta1.InferenceService, namespace string, crName string, remove bool) error { - url := utils.GenerateNonTLSServiceURL(crName, namespace) + url := utils.GenerateKServeLoggerURL(crName, namespace) if remove { if infService.Spec.Predictor.Logger == nil || *infService.Spec.Predictor.Logger.URL != url { @@ -291,6 +291,50 @@ func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *t infService.Spec.Predictor.Logger = &logger } + // Only if the Istio sidecar annotation is set + annotations := infService.GetAnnotations() + if inject, exists := annotations["sidecar.istio.io/inject"]; exists && inject == "true" { + + // Check if DestinationRule CRD is present. If there's an error, don't proceed and return the error + exists, err := r.isDestinationRuleCRDPresent(ctx) + if err != nil { + log.FromContext(ctx).Error(err, "Error verifying DestinationRule CRD is present") + return err + } + + // Try to create the DestinationRule, since CRD exists + if exists { + err := r.ensureDestinationRule(ctx, instance) + if err != nil { + return fmt.Errorf("failed to ensure DestinationRule: %v", err) + } + } else { + // DestinationRule CRD does not exist. Do not attempt to create it and log error + err := fmt.Errorf("the DestinationRule CRD is not present in this cluster") + log.FromContext(ctx).Error(err, "InferenceService has service mesh annotation but DestinationRule CRD not found") + } + + // Check if VirtualService CRD is present. 
If there's an error, don't proceed and return the error + exists, err = r.isVirtualServiceCRDPresent(ctx) + if err != nil { + log.FromContext(ctx).Error(err, "Error verifying VirtualService CRD is present") + return err + } + + // Try to create the VirtualService, since CRD exists + if exists { + err := r.ensureVirtualService(ctx, instance) + if err != nil { + return fmt.Errorf("failed to ensure VirtualService: %v", err) + } + } else { + // VirtualService CRD does not exist. Do not attempt to create it and log error + err := fmt.Errorf("the VirtualService CRD is not present in this cluster") + log.FromContext(ctx).Error(err, "InferenceService has service mesh annotation but VirtualService CRD not found") + } + + } + // Update the InferenceService err := r.Update(ctx, &infService) if err == nil { diff --git a/controllers/tas/statuses.go b/controllers/tas/statuses.go index 72f4431b..ca4d4402 100644 --- a/controllers/tas/statuses.go +++ b/controllers/tas/statuses.go @@ -13,7 +13,8 @@ import ( // IsAllReady checks if all the necessary readiness fields are true for the specific mode func (rs *AvailabilityStatus) IsAllReady(mode string) bool { - return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || (rs.DeploymentReady && rs.RouteReady && mode == STORAGE_DATABASE) + return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || + (rs.DeploymentReady && rs.RouteReady && rs.DBReady && mode == STORAGE_DATABASE) } // AvailabilityStatus has the readiness status of various resources. @@ -22,6 +23,7 @@ type AvailabilityStatus struct { DeploymentReady bool RouteReady bool InferenceServiceReady bool + DBReady bool } func (r *TrustyAIServiceReconciler) updateStatus(ctx context.Context, original *trustyaiopendatahubiov1alpha1.TrustyAIService, update func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService), @@ -53,25 +55,17 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { // Check for PVC readiness status.PVCReady, err = r.checkPVCReady(ctx, instance) - if err != nil || !status.PVCReady { - // PVC not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "PVC not ready") - } } // Check for deployment readiness status.DeploymentReady, err = r.checkDeploymentReady(ctx, instance) - if err != nil || !status.DeploymentReady { - // Deployment not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Deployment not ready") + + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + status.DBReady, _ = r.checkDatabaseAccessible(ctx, instance) } // Check for route readiness status.RouteReady, err = r.checkRouteReady(ctx, instance) - if err != nil || !status.RouteReady { - // Route not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Route not ready") - } // Check if InferenceServices present status.InferenceServiceReady, err = r.checkInferenceServicesPresent(ctx, instance.Namespace) @@ -89,9 +83,20 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { UpdatePVCAvailable(saved) } + UpdateRouteAvailable(saved) + + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + if status.DBReady { + UpdateDBAvailable(saved) + } else { + UpdateDBConnectionError(saved) + return + } + } + UpdateTrustyAIServiceAvailable(saved) - saved.Status.Phase = "Ready" + 
saved.Status.Phase = PhaseReady saved.Status.Ready = v1.ConditionTrue }) if updateErr != nil { @@ -114,13 +119,18 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta } } + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + UpdateDBConnectionError(saved) + } + if status.RouteReady { UpdateRouteAvailable(saved) } else { UpdateRouteNotAvailable(saved) } + UpdateTrustyAIServiceNotAvailable(saved) - saved.Status.Phase = "Ready" + saved.Status.Phase = PhaseNotReady saved.Status.Ready = v1.ConditionFalse }) if updateErr != nil { @@ -143,7 +153,7 @@ func UpdateInferenceServicePresent(saved *trustyaiopendatahubiov1alpha1.TrustyAI func UpdatePVCNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { saved.SetStatus(StatusTypePVCAvailable, StatusReasonPVCNotFound, "PersistentVolumeClaim not found", v1.ConditionFalse) - saved.Status.Phase = "Not Ready" + saved.Status.Phase = PhaseNotReady saved.Status.Ready = v1.ConditionFalse } @@ -165,4 +175,28 @@ func UpdateTrustyAIServiceAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyA func UpdateTrustyAIServiceNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { saved.SetStatus(StatusTypeAvailable, StatusNotAvailable, "Not all components available", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBCredentialsNotFound(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsNotFound, "Database credentials not found", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBCredentialsError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsError, "Error with database credentials", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBConnectionError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBConnectionError, "Error connecting to database", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBAvailable, "Database available", v1.ConditionTrue) } diff --git a/controllers/tas/statuses_test.go b/controllers/tas/statuses_test.go index 34c31d57..b97b156c 100644 --- a/controllers/tas/statuses_test.go +++ b/controllers/tas/statuses_test.go @@ -35,7 +35,7 @@ func setupAndTestStatusNoComponent(instance *trustyaiopendatahubiov1alpha1.Trust // Call the reconcileStatuses function _, _ = reconciler.reconcileStatuses(ctx, instance) - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be false") @@ -127,7 +127,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err :=
checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -191,7 +191,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -260,8 +260,7 @@ var _ = Describe("Status and condition tests", func() { Namespace: instance.Namespace, }, instance) }, "failed to get updated instance") - - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -344,7 +343,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") diff --git a/controllers/tas/suite_test.go b/controllers/tas/suite_test.go index 71114243..9c8f0bda 100644 --- a/controllers/tas/suite_test.go +++ b/controllers/tas/suite_test.go @@ -360,14 +360,75 @@ func makeDeploymentReady(ctx context.Context, k8sClient client.Client, instance Reason: "DeploymentReady", Message: "The deployment is ready", }, + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + Message: "ReplicaSet is progressing", + }, } if deployment.Spec.Replicas != nil { - deployment.Status.ReadyReplicas = 1 - deployment.Status.Replicas = 1 + deployment.Status.ReadyReplicas = *deployment.Spec.Replicas + deployment.Status.Replicas = *deployment.Spec.Replicas + deployment.Status.AvailableReplicas = *deployment.Spec.Replicas + } + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name + "-pod", + Namespace: instance.Namespace, + Labels: deployment.Spec.Selector.MatchLabels, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "trustyai-service", + Image: "quay.io/trustyai/trustyai-service:latest", + Ports: []corev1.ContainerPort{ + { + ContainerPort: 8080, + }, + }, + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + { + Type: corev1.ContainersReady, + Status: corev1.ConditionTrue, + }, + }, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "trustyai-service", + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: 
metav1.Now(), + }, + }, + Ready: true, + RestartCount: 0, + }, + }, + }, + } + + if err := k8sClient.Create(ctx, pod); err != nil { + return err } - return k8sClient.Update(ctx, deployment) + if err := k8sClient.Status().Update(ctx, deployment); err != nil { + return err + } + + return nil } func makeRouteReady(ctx context.Context, k8sClient client.Client, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { diff --git a/controllers/tas/templates/service/deployment.tmpl.yaml b/controllers/tas/templates/service/deployment.tmpl.yaml index f1e429f1..9257b69a 100644 --- a/controllers/tas/templates/service/deployment.tmpl.yaml +++ b/controllers/tas/templates/service/deployment.tmpl.yaml @@ -100,7 +100,7 @@ spec: key: databaseName - name: QUARKUS_DATASOURCE_JDBC_URL {{ if .UseDBTLSCerts }} - value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}?sslMode=verify-ca&serverSslCert=/etc/tls/db/tls.crt" + value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}?requireSSL=true&sslMode=verify-ca&serverSslCert=/etc/tls/db/ca.crt" {{ else }} value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}" {{ end }} @@ -122,6 +122,20 @@ spec: - name: STORAGE_MIGRATION_CONFIG_FROM_FILENAME value: {{ .Instance.Spec.Data.Filename }} {{ end }} + readinessProbe: + httpGet: + path: /q/health/ready + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + livenessProbe: + httpGet: + path: /q/health/live + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 volumeMounts: - name: {{ .Instance.Name }}-internal readOnly: false @@ -132,7 +146,7 @@ spec: readOnly: false {{ end }} {{ if .UseDBTLSCerts }} - - name: db-tls-certs + - name: db-ca-cert mountPath: /etc/tls/db readOnly: true {{ end }} @@ -224,8 +238,8 @@ spec: secretName: {{ .Instance.Name }}-internal defaultMode: 420 {{ if .UseDBTLSCerts }} - - name: db-tls-certs + - name: db-ca-cert secret: - secretName: {{ .Instance.Name }}-db-tls + secretName: {{ .Instance.Name }}-db-ca defaultMode: 420 {{ end }} diff --git a/controllers/tas/templates/service/destination-rule.tmpl.yaml b/controllers/tas/templates/service/destination-rule.tmpl.yaml new file mode 100644 index 00000000..f62e548e --- /dev/null +++ b/controllers/tas/templates/service/destination-rule.tmpl.yaml @@ -0,0 +1,13 @@ +apiVersion: networking.istio.io/v1beta1 +kind: DestinationRule +metadata: + name: {{ .DestinationRuleName }} + namespace: {{ .Namespace }} +spec: + host: {{ .Name }}.{{ .Namespace }}.svc.cluster.local + trafficPolicy: + portLevelSettings: + - port: + number: 443 + tls: + mode: SIMPLE diff --git a/controllers/tas/templates/service/virtual-service.tmpl.yaml b/controllers/tas/templates/service/virtual-service.tmpl.yaml new file mode 100644 index 00000000..8356be94 --- /dev/null +++ b/controllers/tas/templates/service/virtual-service.tmpl.yaml @@ -0,0 +1,16 @@ +apiVersion: networking.istio.io/v1beta1 +kind: VirtualService +metadata: + name: {{ .VirtualServiceName }} + namespace: {{ .Namespace }} +spec: + hosts: + - {{ .Name }}.{{ .Namespace }}.svc.cluster.local + http: + - match: + - port: 80 + route: + - destination: + host: {{ .Name }}.{{ .Namespace }}.svc.cluster.local + port: + number: 443 diff --git a/controllers/tas/trustyaiservice_controller.go b/controllers/tas/trustyaiservice_controller.go index 34aca59e..619ca6ac 100644 --- a/controllers/tas/trustyaiservice_controller.go +++ 
b/controllers/tas/trustyaiservice_controller.go @@ -26,6 +26,7 @@ import ( trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1" "github.com/trustyai-explainability/trustyai-service-operator/controllers/utils" appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -79,6 +80,9 @@ type TrustyAIServiceReconciler struct { //+kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update +//+kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=create;list;watch;get;update;patch;delete +//+kubebuilder:rbac:groups=networking.istio.io,resources=virtualservices,verbs=create;list;watch;get;update;patch;delete +//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=list;watch;get // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. @@ -167,10 +171,28 @@ func (r *TrustyAIServiceReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Get database configuration secret, err := r.findDatabaseSecret(ctx, instance) if err != nil { + _, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + UpdateDBCredentialsNotFound(saved) + UpdateTrustyAIServiceNotAvailable(saved) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse + }) + if updateErr != nil { + return RequeueWithErrorMessage(ctx, err, "Failed to update status") + } return RequeueWithErrorMessage(ctx, err, "Service configured to use database storage but no database configuration found.") } err = r.validateDatabaseSecret(secret) if err != nil { + _, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + UpdateDBCredentialsError(saved) + UpdateTrustyAIServiceNotAvailable(saved) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse + }) + if updateErr != nil { + return RequeueWithErrorMessage(ctx, err, "Failed to update status") + } return RequeueWithErrorMessage(ctx, err, "Database configuration contains errors.") } } diff --git a/controllers/tas/virtual_service.go b/controllers/tas/virtual_service.go new file mode 100644 index 00000000..188c049c --- /dev/null +++ b/controllers/tas/virtual_service.go @@ -0,0 +1,89 @@ +package tas + +import ( + "context" + "fmt" + "reflect" + + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1" + templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/tas/templates" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +const ( + virtualServiceTemplatePath = "service/virtual-service.tmpl.yaml" + virtualServiceCRDName = "virtualservices.networking.istio.io" +) + +// VirtualServiceConfig has the variables for the VirtualService template +type VirtualServiceConfig struct { + Name
string + Namespace string + VirtualServiceName string +} + +// isVirtualServiceCRDPresent returns true if the VirtualService CRD is present, false otherwise +func (r *TrustyAIServiceReconciler) isVirtualServiceCRDPresent(ctx context.Context) (bool, error) { + crd := &apiextensionsv1.CustomResourceDefinition{} + + err := r.Get(ctx, types.NamespacedName{Name: virtualServiceCRDName}, crd) + if err != nil { + if !errors.IsNotFound(err) { + return false, fmt.Errorf("error getting "+virtualServiceCRDName+" CRD: %v", err) + } + // Not found + return false, nil + } + + // Found + return true, nil +} + +func (r *TrustyAIServiceReconciler) ensureVirtualService(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { + + virtualServiceName := instance.Name + "-redirect" + + existingVirtualService := &unstructured.Unstructured{} + existingVirtualService.SetKind("VirtualService") + existingVirtualService.SetAPIVersion("networking.istio.io/v1beta1") + + // Check if the VirtualService already exists + err := r.Get(ctx, types.NamespacedName{Name: virtualServiceName, Namespace: instance.Namespace}, existingVirtualService) + if err == nil { + // VirtualService exists + return nil + } + + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to check for existing VirtualService: %v", err) + } + + virtualServiceConfig := VirtualServiceConfig{ + Name: instance.Name, + Namespace: instance.Namespace, + VirtualServiceName: virtualServiceName, + } + + var virtualService *unstructured.Unstructured + virtualService, err = templateParser.ParseResource[unstructured.Unstructured](virtualServiceTemplatePath, virtualServiceConfig, reflect.TypeOf(&unstructured.Unstructured{})) + if err != nil { + log.FromContext(ctx).Error(err, "could not parse the VirtualService template") + return err + } + + if err := ctrl.SetControllerReference(instance, virtualService, r.Scheme); err != nil { + return err + } + + err = r.Create(ctx, virtualService) + if err != nil { + return fmt.Errorf("failed to create VirtualService: %v", err) + } + + return nil +} diff --git a/controllers/utils/utils.go b/controllers/utils/utils.go index 12eb8ba6..419ab3b0 100644 --- a/controllers/utils/utils.go +++ b/controllers/utils/utils.go @@ -50,3 +50,8 @@ func GenerateTLSServiceURL(crName string, namespace string) string { func GenerateNonTLSServiceURL(crName string, namespace string) string { return "http://" + crName + "." + namespace + ".svc" } + +// GenerateKServeLoggerURL generates a logger URL for KServe inference loggers +func GenerateKServeLoggerURL(crName string, namespace string) string { + return "http://" + crName + "."
+ namespace + ".svc.cluster.local" +} diff --git a/go.mod b/go.mod index 2bd53028..7c98c900 100644 --- a/go.mod +++ b/go.mod @@ -10,8 +10,6 @@ require ( github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.64.1 github.com/spf13/viper v1.19.0 github.com/stretchr/testify v1.9.0 - google.golang.org/grpc v1.64.0 - google.golang.org/protobuf v1.34.2 k8s.io/api v0.26.4 k8s.io/apimachinery v0.26.4 k8s.io/client-go v0.26.4 @@ -40,6 +38,8 @@ require ( go.opentelemetry.io/otel/trace v1.24.0 // indirect golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect golang.org/x/sync v0.6.0 // indirect + google.golang.org/grpc v1.62.1 // indirect + google.golang.org/protobuf v1.33.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect ) @@ -114,7 +114,7 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.26.4 // indirect + k8s.io/apiextensions-apiserver v0.26.4 k8s.io/component-base v0.26.4 // indirect k8s.io/klog/v2 v2.100.1 // indirect k8s.io/kube-openapi v0.0.0-20230515203736-54b630e78af5 // indirect diff --git a/go.sum b/go.sum index 40755336..c2add89e 100644 --- a/go.sum +++ b/go.sum @@ -583,8 +583,8 @@ google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTp google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.40.0/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9KAK34= -google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= -google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= +google.golang.org/grpc v1.62.1 h1:B4n+nfKzOICUXMgyrNd19h/I9oH0L1pizfk1d4zSgTk= +google.golang.org/grpc v1.62.1/go.mod h1:IWTG0VlJLCh1SkC58F7np9ka9mx/WNkjl4PGJaiq+QE= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= @@ -600,8 +600,8 @@ google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= -google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/tests/Dockerfile b/tests/Dockerfile index e39ebdfd..833e6989 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,11 +1,7 @@ FROM registry.access.redhat.com/ubi8:8.10-1020 - ARG ORG=trustyai-explainability ARG BRANCH=main ARG 
ODS_CI_REPO=https://github.com/red-hat-data-services/ods-ci -# This git reference should always reference a stable commit from ods-ci that supports ODH -# This hash corresponds to a March 24th, 2023 commit -ARG ODS_CI_GITREF=a8cf770b37caa4ef7ce6596acc8bdd6866cc7772 ARG OC_CLI_URL=https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/4.14.33/openshift-client-linux.tar.gz ENV HOME /root @@ -36,18 +32,19 @@ COPY Pipfile Pipfile.lock $HOME/peak/ ## Grab CI scripts from single-source-of-truth RUN mkdir -p $HOME/peak/operator-tests/trustyai-explainability/ &&\ mkdir $HOME/kfdef/ &&\ - cp -r $HOME/src/trustyai-explainability/tests/resources/ $HOME/peak/operator-tests/trustyai-explainability/resources &&\ - cp $HOME/src/trustyai-explainability/tests/util $HOME/peak/operator-tests/trustyai-explainability &&\ - cp -r $HOME/src/trustyai-explainability/tests/basictests $HOME/peak/operator-tests/trustyai-explainability/basictests &&\ cp -r $HOME/src/trustyai-explainability/tests/setup/odh-*.yaml $HOME/kfdef/ &&\ cp -r $HOME/src/trustyai-explainability/tests/setup/*setup $HOME/peak/ &&\ cp -r $HOME/src/trustyai-explainability/tests/scripts/installandtest.sh $HOME/peak/ +# Install poetry to support the execution of trustyai-tests +RUN curl -sSL https://install.python-poetry.org | python3 - +ENV PATH="${PATH}:$HOME/.local/bin" +RUN cd $HOME/peak && \ + git clone https://github.com/trustyai-explainability/trustyai-tests.git && \ + cd trustyai-tests && \ + poetry install + COPY scripts/install.sh $HOME/peak/ -#COPY resources $HOME/peak/operator-tests/trustyai-explainability/resources -#COPY util $HOME/peak/operator-tests/trustyai-explainability -#COPY setup/odh-core.yaml $HOME/kfdef/ -#COPY basictests $HOME/peak/operator-tests/trustyai-explainability/basictests RUN chmod -R 777 $HOME/kfdef && \ mkdir -p $HOME/.kube && \ diff --git a/tests/Makefile b/tests/Makefile index 8fa56988..a82e260d 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -15,8 +15,8 @@ OPENSHIFT_TESTUSER_PASS= OPENSHIFT_TESTUSER_LOGIN_PROVIDER= # Setting SKIP_INSTALL will let you run the tests against an ODH instance that is already setup SKIP_INSTALL= -# Setting TESTS_REGEX will allow you to change which tests are going to be run -TESTS_REGEX= +# Pytest markers to select the tests that will be executed +PYTEST_MARKERS= # Location inside the container where CI system will retrieve files after a test run ARTIFACT_DIR=/tmp/artifacts LOCAL_ARTIFACT_DIR="${PWD}/artifacts" @@ -25,17 +25,19 @@ BUILD_TOOL?=podman NO_CACHE?=false LOCAL?=false TEARDOWN?=false +PLATFORM?=linux/amd64 + all: test test: build run clean build: - ${BUILD_TOOL} build -t $(IMAGE) --build-arg ORG=$(GIT_ORG) --build-arg BRANCH=$(GIT_BRANCH) --build-arg ODS_CI_REPO=$(ODS_CI_REPO) --build-arg ODS_CI_GITREF=$(ODS_CI_GITREF) --build-arg OC_CLI_URL=$(OC_CLI_URL) . + ${BUILD_TOOL} build -t $(IMAGE) --build-arg ORG=$(GIT_ORG) --build-arg BRANCH=$(GIT_BRANCH) --build-arg ODS_CI_REPO=$(ODS_CI_REPO) --build-arg ODS_CI_GITREF=$(ODS_CI_GITREF) --build-arg OC_CLI_URL=$(OC_CLI_URL) --platform=$(PLATFORM) .
--progress=plain run: # Confirm that we have a directory for storing any screenshots from selenium tests mkdir -p ${LOCAL_ARTIFACT_DIR}/screenshots oc config view --flatten --minify > /tmp/tests-kubeconfig - ${BUILD_TOOL} run -e SKIP_INSTALL=$(SKIP_INSTALL) -e TESTS_REGEX=$(TESTS_REGEX) -e SKIP_OPERATOR_INSTALL=$(SKIP_OPERATOR_INSTALL) \ + ${BUILD_TOOL} run -e SKIP_INSTALL=$(SKIP_INSTALL) -e PYTEST_MARKERS=$(PYTEST_MARKERS) -e SKIP_OPERATOR_INSTALL=$(SKIP_OPERATOR_INSTALL) \ -e SKIP_KFDEF_INSTALL=$(SKIP_KFDEF_INSTALL) -e ODHPROJECT=$(ODHPROJECT) \ -e OPENSHIFT_TESTUSER_NAME="$(OPENSHIFT_TESTUSER_NAME)" -e OPENSHIFT_TESTUSER_PASS="$(OPENSHIFT_TESTUSER_PASS)" -e OPENSHIFT_TESTUSER_LOGIN_PROVIDER=$(OPENSHIFT_TESTUSER_LOGIN_PROVIDER) -e ARTIFACT_DIR=$(ARTIFACT_DIR) \ -e LOCAL=$(LOCAL) -e TEARDOWN=$(TEARDOWN)\ diff --git a/tests/README.md b/tests/README.md index 3dcbd3f3..4864e3aa 100644 --- a/tests/README.md +++ b/tests/README.md @@ -24,7 +24,7 @@ make clean # remove the artifacts of the test from the cluster (operator, ODH, p * `BUILD_TOOL=docker/podman`: set the tool used to build and run the testing container * `SKIP_INSTALL=true/false`: skip the install of the ODH operator, if you've already installed it manually or via a previous test * `SKIP_KFDEF_INSTALL=true/false`: skip the install of ODH via KFdef, if you've already installed it manually or via a previous test -* `TESTS_REGEX=${REGEX}`: only run tests whose names match the regex +* `PYTEST_MARKERS`: Used to select the tests that will be executed. [Available markers](https://github.com/trustyai-explainability/trustyai-tests/blob/main/pyproject.toml). * `LOCAL=true/false`: This flag makes the test suite stop and wait for user input between the end of a test script and cluster teardown. This prevents automatic teardown, which is useful for manual inspection of the cluster before teardown when running the tests locally. * `TEARDOWN=true/false`: This flag will just run the corresponding `teardown` functions within the various tests, useful for cleaning up stranded components from failed tests, without deleting the operator and ODH install. It's recommended to use this with a `make run`, as using `make test` will trigger a `make clean` that fully wipes the cluster. @@ -47,10 +47,6 @@ If you'd like to run the tests against an instance that already has a KfDef crea you set `SKIP_KFDEF_INSTALL=true` and that will cause the test run to skip the step of creating the default KfDef. example: `make run SKIP_KFDEF_INSTALL=true` -If you'd like to run a single test instead of all tests, you can -set the TESTS_REGEX variable `TESTS_REGEX=`. That will -only run the test that you specify instead of all of the tests. example: `make run TESTS_REGEX=grafana` - If you have a local instance already running the operator and you'd like to skip that part of the install process, you can set `SKIP_OPERATOR_INSTALL=true` and that will bypass installation of the operator, but will still install the authentication for any user tests. diff --git a/tests/scripts/install.sh b/tests/scripts/install.sh index cc47a549..06eb6ccf 100755 --- a/tests/scripts/install.sh +++ b/tests/scripts/install.sh @@ -128,13 +128,23 @@ else echo "Creating the following DSC" cat ./${DSC_FILENAME} > ${ARTIFACT_DIR}/${DSC_FILENAME} - oc apply -f ./odh-core-dsci.yaml - oc apply -f ./${DSC_FILENAME} - kfctl_result=$? 
- if [ "$kfctl_result" -ne 0 ]; then + start_t=$(date +%s) 2>&1 + ready=1 2>&1 + while [ "$ready" -ne 0 ]; do + oc apply -f ./odh-core-dsci.yaml + oc apply -f ./${DSC_FILENAME} + ready=$? + if [ $(($(date +%s)-start_t)) -gt 300 ]; then + echo "ODH DSC Installation timeout" + exit 1 + fi + sleep 10 + done + + if [ "$ready" -ne 0 ]; then echo "The installation failed" - exit $kfctl_result + exit $ready fi fi set +x