Add phase 1 implementation of new storage spec (kubeflow#1899)

* Add phase 1 implementation of new storage spec Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * Storagespec refactoring * Some refactoring - Changed GetStorageURI func to always return the storageUri field and not impute from StorageSpec information - Instead have the addStorageSpecAnnotations func set the storage uri annotation if a path is specified - Deduplicated logic in predictor.go which sets the storage uri annotation from explicit storageUri field, and fail if the annotation is already set (i.e. if both storage.path and storageUri are specified) - Renamed CreateStorageSpecSecretVolumeAndEnv func to CreateStorageSpecSecretEnvs and refactored the logic to fail if an explicit secret storageKey is not found, but not if it's a default one - Renamed default storage keys to look up to "default" and "default_s3" depending on whether storage type information was provided - Moved some string names to constants - Removed now-not-applicable predictor_test.go and explainer_test.go - Extended tests in storage_initializer_injector_test.go to cover StorageSpec cases * Also check for existence of StorageSpec in IsMMSPredictor func Co-authored-by: Nick Hill <nickhill@us.ibm.com> Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * update python server with the new env variables Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * Update example due to the recent sklearn library upgrade Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * change static int to compute from variable length Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * abstract predictor and explainer validate function. Also add storageSpec validation Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * remove reduntant storageSpec check in the service_account_credentials.go Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * Add aws anonymous mode flog Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * generate python sdk and add storagespec s3 e2e test Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * update ci overlays Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * Proxy minio service to CI client. Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * move minio model init to a k8s job Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * deploy minio job with permitted ns Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * Add non-codegen import Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> * clean up new merges Signed-off-by: tomcli <tommy.chaoping.li@ibm.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com>
magdalenakuhn17 · Feb 22, 2022 · 3ac7982 · 3ac7982
1 parent ab2896d
commit 3ac7982
Show file tree

Hide file tree

Showing 85 changed files with 13,410 additions and 11,333 deletions.
diff --git a/config/configmap/inferenceservice.yaml b/config/configmap/inferenceservice.yaml
@@ -140,7 +140,8 @@ data:
         "memoryRequest": "100Mi",
         "memoryLimit": "1Gi",
         "cpuRequest": "100m",
-        "cpuLimit": "1"
+        "cpuLimit": "1",
+        "storageSpecSecretName": "storage-config"
     }
   credentials: |-
     {

diff --git a/config/crd/serving.kserve.io_inferenceservices.yaml b/config/crd/serving.kserve.io_inferenceservices.yaml
diff --git a/config/overlays/test/kustomization.yaml b/config/overlays/test/kustomization.yaml
@@ -1,5 +1,6 @@
 bases:
   - ../../default
+  - minio
 
 patches:
   - configmap/inferenceservice.yaml

diff --git a/config/overlays/test/minio/kustomization.yaml b/config/overlays/test/minio/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: kserve
+
+resources:
+- minio-deployment.yaml
+- minio-service.yaml
+- mlpipeline-minio-artifact-secret.yaml
diff --git a/config/overlays/test/minio/minio-deployment.yaml b/config/overlays/test/minio/minio-deployment.yaml
@@ -0,0 +1,47 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: minio
+  labels:
+    app: minio
+spec:
+  selector:
+    matchLabels:
+      app: minio
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: minio
+    spec:
+      containers:
+      - args:
+        - server
+        - /data
+        env:
+        - name: MINIO_ACCESS_KEY
+          valueFrom:
+            secretKeyRef:
+              name: mlpipeline-minio-artifact
+              key: accesskey
+        - name: MINIO_SECRET_KEY
+          valueFrom:
+            secretKeyRef:
+              name: mlpipeline-minio-artifact
+              key: secretkey
+        image: gcr.io/ml-pipeline/minio:RELEASE.2019-08-14T20-37-41Z-license-compliance
+        name: minio
+        ports:
+        - containerPort: 9000
+        volumeMounts:
+        - mountPath: /data
+          name: data
+          subPath: minio
+        resources:
+          requests:
+            cpu: 20m
+            memory: 100Mi
+      volumes:
+      - name: data
+        emptyDir: {}
diff --git a/config/overlays/test/minio/minio-init-job.yaml b/config/overlays/test/minio/minio-init-job.yaml
@@ -0,0 +1,20 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: minio-init
+spec:
+  template:
+    spec:
+      containers:
+      - name: minio-init
+        image: minio/mc:RELEASE.2020-11-25T23-04-07Z
+        command:
+        - /bin/sh
+        - -c
+        - |
+          mc config host add storage http://minio-service.kserve:9000 minio minio123
+          mc mb storage/example-models
+          curl -L https://storage.googleapis.com/kfserving-examples/models/sklearn/1.0/model/model.joblib -o sklearn-model.joblib
+          mc cp sklearn-model.joblib storage/example-models/sklearn/model.joblib
+      restartPolicy: Never
+  backoffLimit: 3
diff --git a/config/overlays/test/minio/minio-service.yaml b/config/overlays/test/minio/minio-service.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: minio-service
+spec:
+  ports:
+  - name: http
+    port: 9000
+    protocol: TCP
+    targetPort: 9000
+  selector:
+    app: minio
diff --git a/config/overlays/test/minio/minio-user-secret.yaml b/config/overlays/test/minio/minio-user-secret.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: storage-config
+type: Opaque
+stringData:
+  localMinIO: |
+    {
+      "type": "s3",
+      "access_key_id": "minio",
+      "secret_access_key": "minio123",
+      "endpoint_url": "http://minio-service.kserve:9000",
+      "bucket": "mlpipeline",
+      "region": "us-south",
+      "anonymous": "False"
+    }
diff --git a/config/overlays/test/minio/mlpipeline-minio-artifact-secret.yaml b/config/overlays/test/minio/mlpipeline-minio-artifact-secret.yaml
@@ -0,0 +1,8 @@
+kind: Secret
+apiVersion: v1
+metadata:
+  name: mlpipeline-minio-artifact
+stringData:
+  accesskey: minio
+  secretkey: minio123
+
diff --git a/docs/samples/storage/storageSpec/README.md b/docs/samples/storage/storageSpec/README.md
@@ -0,0 +1,100 @@
+# Predict on a InferenceService with StorageSpec
+## Setup
+1. Your `~/.kube/config` should point to a cluster with [KServe installed](https://github.com/kserve/kserve#installation).
+2. Your cluster's Istio Ingress gateway must be [network accessible](https://istio.io/latest/docs/tasks/traffic-management/ingress/ingress-control/).
+3. Your cluster's Istio Egresss gateway must [allow accessing S3 Storage](https://knative.dev/docs/serving/outbound-network-access/)
+
+## Create Common Storage Secret
+Create a common secret with your [storage credential](https://console.aws.amazon.com/iam/home#/users), `KServe` reads the secret key to inject
+the necessary storage volume on storage initializer or model agent to download the models from the destination storage.
+```yaml
+apiVersion: v1
+stringData:
+  localMinIO: |
+    {
+      "type": "s3",
+      "access_key_id": "minio",
+      "secret_access_key": "minio123",
+      "endpoint_url": "http://minio-service.kubeflow:9000",
+      "bucket": "mlpipeline",
+      "region": "us-south",
+      "anonymous": "False"
+    }
+kind: Secret
+metadata:
+  name: storage-config
+type: Opaque
+```
+
+Apply the secret and service account
+```bash
+kubectl apply -f common_secret.yaml
+```
+
+Then, download the [sklearn model.joblib](https://console.cloud.google.com/storage/browser/kfserving-examples/models/sklearn/1.0/model) and store the model at the path `sklearn/model.joblib` inside the a new bucket called `example-models`.
+
+Note: if you are running kserve with istio sidecars enabled, there can be a race condition between the istio proxy being ready and the agent pulling models.
+This will result in a `tcp dial connection refused` error when the agent tries to download from s3.
+To resolve it, istio allows the blocking of other containers in a pod until the proxy container is ready.
+You can enabled this by setting `proxy.holdApplicationUntilProxyStarts: true` in `istio-sidecar-injector` configmap,
+`proxy.holdApplicationUntilProxyStarts` flag was introduced in Istio 1.7 as an experimental feature and is turned off by default.
+
+## Create the InferenceService
+Create the InferenceService with the storage spec and select the credential entry.
+```yaml
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: example-sklearn-isvc
+spec:
+  predictor:
+    sklearn:
+      storage:
+        key: localMinIO # Credential key for the destination storage in the common secret
+        path: sklearn # Model path inside the bucket
+        # schemaPath: null # Optional schema files for payload schema
+        parameters: # Parameters to override the default values inside the common secret.
+          bucket: example-models
+```
+
+```bash
+kubectl apply -f sklearn_storagespec.yaml
+```
+
+Expected Output
+```
+$ inferenceservice.serving.kserve.io/example-sklearn-isvc created
+```
+
+## Run a prediction
+The first step is to [determine the ingress IP and ports](../../../../README.md#determine-the-ingress-ip-and-ports) and set `INGRESS_HOST` and `INGRESS_PORT`
+
+```bash
+MODEL_NAME=example-sklearn-isvc
+INPUT_PATH=@./iris-input.json
+SERVICE_HOSTNAME=$(kubectl get inferenceservice ${MODEL_NAME} -o jsonpath='{.status.url}' | cut -d "/" -f 3)
+curl -v -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/$MODEL_NAME:predict -d $INPUT_PATH
+```
+Expected Output
+```
+*   Trying 169.45.97.44:80...
+* Connected to 169.45.97.44 (169.45.97.44) port 80 (#0)
+> POST /v1/models/example-sklearn-isvc:predict HTTP/1.1
+> Host: example-sklearn-isvc.default.example.com
+> User-Agent: curl/7.71.1
+> Accept: */*
+> Content-Length: 76
+> Content-Type: application/x-www-form-urlencoded
+>
+* upload completely sent off: 76 out of 76 bytes
+* Mark bundle as not supporting multiuse
+< HTTP/1.1 200 OK
+< content-length: 23
+< content-type: application/json; charset=UTF-8
+< date: Thu, 04 Nov 2021 23:17:45 GMT
+< server: istio-envoy
+< x-envoy-upstream-service-time: 12
+<
+* Connection #0 to host 169.45.97.44 left intact
+{"predictions": [1, 1]}
+```
diff --git a/docs/samples/storage/storageSpec/common_secret.yaml b/docs/samples/storage/storageSpec/common_secret.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+stringData:
+  localMinIO: |
+    {
+      "type": "s3",
+      "access_key_id": "minio",
+      "secret_access_key": "minio123",
+      "endpoint_url": "http://minio-service.kubeflow:9000",
+      "bucket": "mlpipeline",
+      "region": "us-south"
+    }
+kind: Secret
+metadata:
+  name: storage-config
+type: Opaque
diff --git a/docs/samples/storage/storageSpec/iris-input.json b/docs/samples/storage/storageSpec/iris-input.json
@@ -0,0 +1,6 @@
+{
+  "instances": [
+    [6.8,  2.8,  4.8,  1.4],
+    [6.0,  3.4,  4.5,  1.6]
+  ]
+}
diff --git a/docs/samples/storage/storageSpec/sklearn_storagespec.yaml b/docs/samples/storage/storageSpec/sklearn_storagespec.yaml
@@ -0,0 +1,13 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: example-sklearn-isvc
+spec:
+  predictor:
+    sklearn:
+      storage:
+        key: localMinIO
+        path: sklearn
+        # schemaPath: null
+        parameters:
+          bucket: example-models
diff --git a/hack/violation_exceptions.list b/hack/violation_exceptions.list
@@ -37,4 +37,5 @@ API rule violation: names_match,./pkg/apis/serving/v1beta1,PredictorsConfig,Ligh
 API rule violation: names_match,./pkg/apis/serving/v1beta1,PredictorsConfig,PyTorch
 API rule violation: names_match,./pkg/apis/serving/v1beta1,PredictorsConfig,SKlearn
 API rule violation: names_match,./pkg/apis/serving/v1beta1,PredictorsConfig,XGBoost
+API rule violation: names_match,./pkg/apis/serving/v1beta1,StorageSpec,StorageKey
 API rule violation: names_match,./pkg/apis/serving/v1beta1,TransformerConfig,ContainerImage
diff --git a/pkg/apis/serving/v1beta1/component.go b/pkg/apis/serving/v1beta1/component.go
@@ -36,6 +36,7 @@ const (
 	MaxReplicasLowerBoundExceededError  = "MaxReplicas cannot be less than 0."
 	ParallelismLowerBoundExceededError  = "Parallelism cannot be less than 0."
 	UnsupportedStorageURIFormatError    = "storageUri, must be one of: [%s] or match https://{}.blob.core.windows.net/{}/{} or be an absolute or relative local path. StorageUri [%s] is not supported."
+	UnsupportedStorageSpecFormatError   = "storage.spec.type, must be one of: [%s]. storage.spec.type [%s] is not supported."
 	InvalidLoggerType                   = "Invalid logger type"
 	InvalidISVCNameFormatError          = "The InferenceService \"%s\" is invalid: a InferenceService name must consist of lower case alphanumeric characters or '-', and must start with alphabetical character. (e.g. \"my-name\" or \"abc-123\", regex used for validation is '%s')"
 	MaxWorkersShouldBeLessThanMaxError  = "Workers cannot be greater than %d"
@@ -45,9 +46,10 @@ const (
 
 // Constants
 var (
-	SupportedStorageURIPrefixList = []string{"gs://", "s3://", "pvc://", "file://", "https://", "http://"}
-	AzureBlobURL                  = "blob.core.windows.net"
-	AzureBlobURIRegEx             = "https://(.+?).blob.core.windows.net/(.+)"
+	SupportedStorageURIPrefixList     = []string{"gs://", "s3://", "pvc://", "file://", "https://", "http://"}
+	SupportedStorageSpecURIPrefixList = []string{"s3://"}
+	AzureBlobURL                      = "blob.core.windows.net"
+	AzureBlobURIRegEx                 = "https://(.+?).blob.core.windows.net/(.+)"
 )
 
 // ComponentImplementation interface is implemented by predictor, transformer, and explainer implementations
@@ -57,6 +59,7 @@ type ComponentImplementation interface {
 	Validate() error
 	GetContainer(metadata metav1.ObjectMeta, extensions *ComponentExtensionSpec, config *InferenceServicesConfig) *v1.Container
 	GetStorageUri() *string
+	GetStorageSpec() *StorageSpec
 	GetProtocol() constants.InferenceServiceProtocol
 	IsMMS(config *InferenceServicesConfig) bool
 }
@@ -107,6 +110,31 @@ func (s *ComponentExtensionSpec) Validate() error {
 	})
 }
 
+func validateStorageSpec(storageSpec *StorageSpec, storageURI *string) error {
+	if storageSpec == nil {
+		return nil
+	}
+	if storageSpec != nil && storageURI != nil {
+		if utils.IsPrefixSupported(*storageURI, SupportedStorageSpecURIPrefixList) {
+			return nil
+		} else {
+			return fmt.Errorf(UnsupportedStorageURIFormatError, strings.Join(SupportedStorageSpecURIPrefixList, ", "), *storageURI)
+		}
+	}
+	if storageSpec.Parameters != nil {
+		for k, v := range *storageSpec.Parameters {
+			if k == "type" {
+				if utils.IsPrefixSupported(v+"://", SupportedStorageSpecURIPrefixList) {
+					return nil
+				} else {
+					return fmt.Errorf(UnsupportedStorageSpecFormatError, strings.Join(SupportedStorageSpecURIPrefixList, ", "), v)
+				}
+			}
+		}
+	}
+	return nil
+}
+
 func validateStorageURI(storageURI *string) error {
 	if storageURI == nil {
 		return nil

diff --git a/pkg/apis/serving/v1beta1/explainer.go b/pkg/apis/serving/v1beta1/explainer.go
@@ -16,7 +16,10 @@ limitations under the License.
 
 package v1beta1
 
-import v1 "k8s.io/api/core/v1"
+import (
+	"github.com/kserve/kserve/pkg/utils"
+	v1 "k8s.io/api/core/v1"
+)
 
 // ExplainerSpec defines the container spec for a model explanation server,
 // The following fields follow a "1-of" semantic. Users must specify exactly one spec.
@@ -49,10 +52,34 @@ type ExplainerExtensionSpec struct {
 	// Each framework will have different defaults that are populated in the underlying container spec.
 	// +optional
 	v1.Container `json:",inline"`
+	// Storage Spec for model location
+	// +optional
+	Storage *StorageSpec `json:"storage,omitempty"`
 }
 
 var _ Component = &ExplainerSpec{}
 
+// Validate returns an error if invalid
+func (e *ExplainerExtensionSpec) Validate() error {
+	return utils.FirstNonNilError([]error{
+		validateStorageURI(e.GetStorageUri()),
+		validateStorageSpec(e.GetStorageSpec(), e.GetStorageUri()),
+	})
+}
+
+// GetStorageUri returns the predictor storage Uri
+func (e *ExplainerExtensionSpec) GetStorageUri() *string {
+	if e.StorageURI != "" {
+		return &e.StorageURI
+	}
+	return nil
+}
+
+// GetStorageSpec returns the predictor storage spec object
+func (e *ExplainerExtensionSpec) GetStorageSpec() *StorageSpec {
+	return e.Storage
+}
+
 // GetImplementations returns the implementations for the component
 func (s *ExplainerSpec) GetImplementations() []ComponentImplementation {
 	implementations := NonNilComponents([]ComponentImplementation{