Skip to content

Commit

Permalink
Sherif akoush/metrics dashboard (#241)
Browse files Browse the repository at this point in the history
* fix make file in api

* tidy up http connection

* modified dashboard

* add idle connection timeout

* fix unit

* fix lint

* make secret optional

* make bucket name configurable
  • Loading branch information
sakoush authored May 26, 2022
1 parent af0543d commit 1634537
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 37 deletions.
3 changes: 1 addition & 2 deletions apis/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ build:
./mlops/scheduler/scheduler.proto \
./mlops/scheduler/storage.proto \
./mlops/chainer/chainer.proto \
./mlops/v2_dataplane/v2_dataplane.proto \
./mlops/v2_model_repository/v2_model_repository.proto
./mlops/v2_dataplane/v2_dataplane.proto


PROTOC_PYTHON_OUT := --python_out=./python
Expand Down
63 changes: 41 additions & 22 deletions prometheus/dashboards/Seldon Core Model Mesh Monitoring.json
Original file line number Diff line number Diff line change
Expand Up @@ -520,9 +520,21 @@
"intervalFactor": 1,
"legendFormat": "Evict Rate",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"exemplar": true,
"expr": "sum(rate(seldon_mesh_seldon_cache_miss_count[1m]))",
"hide": false,
"interval": "",
"legendFormat": "Miss Rate",
"refId": "B"
}
],
"title": "Model Evict Rate [1m]",
"title": "Model Evict/Miss Rate [1m]",
"type": "timeseries"
},
{
Expand Down Expand Up @@ -586,7 +598,7 @@
"x": 8,
"y": 5
},
"id": 13,
"id": 20,
"options": {
"legend": {
"calcs": [],
Expand All @@ -605,16 +617,31 @@
"uid": "P1809F7CD0C75ACF3"
},
"exemplar": true,
"expr": "sum(rate(seldon_mesh_seldon_cache_miss_count[1m]))",
"expr": "sum by (server) (rate(seldon_mesh_seldon_load_model_counter[1m]))",
"format": "time_series",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Miss Rate",
"legendFormat": "{{server}}_Load",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"exemplar": true,
"expr": "sum by (server) (rate(seldon_mesh_seldon_unload_model_counter[1m]))",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{server}}_Unload",
"refId": "B"
}
],
"title": "Model Miss Rate [1m]",
"title": "Model Load/Unload Rate [1m]",
"type": "timeseries"
},
{
Expand Down Expand Up @@ -772,6 +799,9 @@
"viz": false
},
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
Expand Down Expand Up @@ -831,26 +861,15 @@
"uid": "P1809F7CD0C75ACF3"
},
"exemplar": true,
"expr": "quantile(0.9, \n(\nrate(seldon_mesh_seldon_infer_seconds_total{container=\"agent\"}[1m]) / rate(seldon_mesh_seldon_infer_total{container=\"agent\"}[1m])\n)\n) by (server)",
"expr": "avg((rate(seldon_mesh_seldon_aggregate_infer_seconds_total{container=\"agent\"}[1m]) / rate(seldon_mesh_seldon_aggregate_infer_total{container=\"agent\"}[1m])) > 0 ) by (server, method_type)",
"hide": false,
"interval": "",
"legendFormat": "{{server}}_90%",
"legendFormat": "{{server}}_{{method_type}}_avg",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"exemplar": true,
"expr": "quantile(0.5, \n(\nrate(seldon_mesh_seldon_infer_seconds_total{container=\"agent\"}[10m]) / rate(seldon_mesh_seldon_infer_total{container=\"agent\"}[10m])\n)\n) by (server)",
"hide": false,
"interval": "",
"legendFormat": "{{server}}_50%",
"refId": "B"
}
],
"title": "Infer Latency (avg[1m])",
"title": "Infer Latency [1m]",
"transformations": [],
"type": "timeseries"
},
{
Expand Down Expand Up @@ -1053,7 +1072,7 @@
"refId": "B"
}
],
"title": "CPU Rate [1m]",
"title": "CPU [1m]",
"type": "timeseries"
}
],
Expand All @@ -1072,6 +1091,6 @@
"timezone": "",
"title": "Seldon Core Model Mesh Monitoring",
"uid": "MHloCP_7z",
"version": 9,
"version": 22,
"weekStart": ""
}
2 changes: 1 addition & 1 deletion scheduler/cmd/agent/cli/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func makeArgs() {
flag.IntVar(
&ReverseProxyHttpPort,
flagReverseProxyHttpPort,
agent.ReverseProxyHTTPPort,
agent.DefaultReverseProxyHTTPPort,
"Reverse proxy http port",
)
flag.IntVar(
Expand Down
14 changes: 8 additions & 6 deletions scheduler/pkg/agent/rproxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ import (
)

const (
ReverseProxyHTTPPort = 9999
maxIdleConnsHTTP = 10
maxIdleConnsPerHostHTTP = 10
disableKeepAlivesHTTP = false
maxConnsPerHostHTTP = 10
defaultTimeoutSeconds = 5
DefaultReverseProxyHTTPPort = 9999
maxIdleConnsHTTP = 10
maxIdleConnsPerHostHTTP = 10
disableKeepAlivesHTTP = false
maxConnsPerHostHTTP = 20
defaultTimeoutSeconds = 5
idleConnTimeoutSeconds = 60
)

type reverseHTTPProxy struct {
Expand Down Expand Up @@ -84,6 +85,7 @@ func (rp *reverseHTTPProxy) Start() error {
MaxIdleConnsPerHost: maxIdleConnsPerHostHTTP,
DisableKeepAlives: disableKeepAlivesHTTP,
MaxConnsPerHost: maxConnsPerHostHTTP,
IdleConnTimeout: idleConnTimeoutSeconds * time.Second,
}
rp.logger.Infof("Start reverse proxy on port %d for %s", rp.port, backend)
rp.server = &http.Server{Addr: ":" + strconv.Itoa(int(rp.port)), Handler: rp.addHandlers(proxy)}
Expand Down
1 change: 1 addition & 0 deletions scheduler/pkg/agent/v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ func NewV2Client(host string, port int, logger log.FieldLogger, isGrpc bool) *V2
MaxIdleConnsPerHost: maxIdleConnsPerHostHTTP,
DisableKeepAlives: disableKeepAlivesHTTP,
MaxConnsPerHost: maxConnsPerHostHTTP,
IdleConnTimeout: idleConnTimeoutSeconds * time.Second,
}
netClient := &http.Client{
Timeout: time.Second * defaultTimeoutSeconds,
Expand Down
11 changes: 7 additions & 4 deletions tests/k6/configs/k8s/base/k6.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ spec:
"--out",
"csv=results/base.gz",
"-u",
"50",
"5",
"-i",
"10000",
"-d",
Expand All @@ -33,17 +33,19 @@ spec:
- name: INFER_GRPC_ITERATIONS
value: "1"
- name: MODELNAME_PREFIX
value: "model"
value: "tfsimpleu"
- name: MODEL_TYPE
value: "tfsimple"
- name: MODEL_MEMORY_BYTES
value: "1000000"
value: "500000"
- name: MAX_NUM_MODELS
value: "800"
value: "20000"
- name: INFER_BATCH_SIZE
value: "1"
- name: GOOGLE_APPLICATION_CREDENTIALS
value: "/var/run/secret/cloud.google.com/k6-service-account.json"
- name: GS_BUCKET_NAME
value: "gs://seldon-tmp/scv2-k6-results"
volumeMounts:
- name: "service-account"
mountPath: "/var/run/secret/cloud.google.com"
Expand All @@ -54,6 +56,7 @@ spec:
- name: "service-account"
secret:
secretName: "k6-sa-key"
optional: true
- name: podinfo
downwardAPI:
items:
Expand Down
8 changes: 6 additions & 2 deletions tests/k6/k6wrapper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@ LABELS=$(cat /info/labels)
# Extract the controller UID from the pod labels,
# e.g.: controller-uid="95a4c449-5cda-45a0-93e1-177caacc3639" job-name="k6"
JOBID=$(echo $LABELS | sed -n 's/.*controller-uid="\([a-zA-Z0-9-]\+\)".*/\1/p')
gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
if [ -f "$GOOGLE_APPLICATION_CREDENTIALS" ]; then
gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
fi
echo "start:"$(date) > $DIR/$METADATA
k6 $@
echo "end:"$(date) >> $DIR/$METADATA
echo "args:"$@ >> $DIR/$METADATA
echo "envs:"$(printenv) >> $DIR/$METADATA
echo "metadata:"$TEST_METADATA >> $DIR/$METADATA
echo "labels:"$LABELS >> $DIR/$METADATA
gsutil cp -r $DIR gs://seldon-tmp/scv2-k6-results/${TEST_METADATA}_${JOBID}_${NOW}_${UUID}
if [ -f "$GOOGLE_APPLICATION_CREDENTIALS" ]; then
gsutil cp -r $DIR ${GS_BUCKET_NAME}/${TEST_METADATA}_${JOBID}_${NOW}_${UUID}
fi

0 comments on commit 1634537

Please sign in to comment.