config/runtimes/triton-2.x.yaml

# Copyright 2021 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: serving.kserve.io/v1alpha1
kind: ClusterServingRuntime
metadata:
  name: triton-2.x
  labels:
    name: modelmesh-serving-triton-2.x-SR
  annotations:
    maxLoadingConcurrency: "2"
spec:
  supportedModelFormats:
    - name: keras
      version: "2" # 2.6.0
      autoSelect: true
    - name: onnx
      version: "1" # 1.5.3
      autoSelect: true
    - name: pytorch
      version: "1" # 1.8.0a0+17f8c32
      autoSelect: true
    - name: tensorflow
      version: "1" # 1.15.4
      autoSelect: true
    - name: tensorflow
      version: "2" # 2.3.1
      autoSelect: true
    - name: tensorrt
      version: "7" # 7.2.1
      autoSelect: true
    - name: sklearn
      version: "0" # v0.23.1
      autoSelect: false
    - name: xgboost
      version: "1" # v1.1.1
      autoSelect: false
    - name: lightgbm
      version: "3" # v3.2.1
      autoSelect: false

  protocolVersions:
    - grpc-v2
  multiModel: true

  grpcEndpoint: "port:8085"
  grpcDataEndpoint: "port:8001"

  containers:
    - name: triton
      image: tritonserver-2:replace
      command: [/bin/sh]
      args:
        - -c
        - 'mkdir -p /models/_triton_models;
          chmod 777 /models/_triton_models;
          exec tritonserver
          "--model-repository=/models/_triton_models"
          "--model-control-mode=explicit"
          "--strict-model-config=false"
          "--strict-readiness=false"
          "--allow-http=true"
          "--allow-sagemaker=false"
          '
      resources:
        requests:
          cpu: 500m
          memory: 1Gi
        limits:
          cpu: "5"
          memory: 1Gi
      livenessProbe:
        # the server is listening only on 127.0.0.1, so an httpGet probe sent
        # from the kublet running on the node cannot connect to the server
        # (not even with the Host header or host field)
        # exec a curl call to have the request originate from localhost in the
        # container
        exec:
          command:
            - curl
            - --fail
            - --silent
            - --show-error
            - --max-time
            - "9"
            - http://localhost:8000/v2/health/live
        initialDelaySeconds: 5
        periodSeconds: 30
        timeoutSeconds: 10
  builtInAdapter:
    serverType: triton
    runtimeManagementPort: 8001
    memBufferBytes: 134217728
    modelLoadingTimeoutMillis: 90000