From 760584be478a2e60ca075206d88ec886aa247286 Mon Sep 17 00:00:00 2001
From: Somasundaram
Date: Mon, 3 Jun 2024 13:01:51 -0700
Subject: [PATCH] Convert cuda env tgi variables to lmi

---
 .../docker/dockerd-entrypoint-with-cuda-compat.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/serving/docker/dockerd-entrypoint-with-cuda-compat.sh b/serving/docker/dockerd-entrypoint-with-cuda-compat.sh
index 17c2682c1..9a051f649 100644
--- a/serving/docker/dockerd-entrypoint-with-cuda-compat.sh
+++ b/serving/docker/dockerd-entrypoint-with-cuda-compat.sh
@@ -47,6 +47,18 @@ translateTGIToLMI "SM_NUM_GPUS" "TENSOR_PARALLEL_DEGREE"
 translateTGIToLMI "MAX_CONCURRENT_REQUESTS" "SERVING_JOB_QUEUE_SIZE"
 translateTGIToLMI "MAX_BATCH_PREFILL_TOKENS" "OPTION_MAX_ROLLING_BATCH_PREFILL_TOKENS"
 translateTGIToLMI "MAX_BATCH_SIZE" "OPTION_MAX_ROLLING_BATCH_SIZE"
+if [[ -n "$ENABLE_CUDA_GRAPHS" && -z "$OPTION_ENFORCE_EAGER" ]]; then
+  if [[ "$ENABLE_CUDA_GRAPHS" = true ]]; then
+    export "OPTION_ENFORCE_EAGER"=false
+  else
+    export "OPTION_ENFORCE_EAGER"=true
+  fi
+fi
+if [[ "$SERVING_FEATURES" = "trtllm" ]]; then
+  translateTGIToLMI "CUDA_MEMORY_FRACTION" "OPTION_KV_CACHE_FREE_GPU_MEM_FRACTION"
+else
+  translateTGIToLMI "CUDA_MEMORY_FRACTION" "OPTION_GPU_MEMORY_UTILIZATION"
+fi
 
 if [[ "$1" = "serve" ]]; then
   shift 1
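
For context, a minimal standalone sketch of the translation behavior this patch adds. The translateTGIToLMI body below is a hypothetical stand-in (the real helper is defined earlier in dockerd-entrypoint-with-cuda-compat.sh); it is assumed to copy a TGI env var into its LMI counterpart only when the TGI var is set and the LMI var is not, mirroring the ENABLE_CUDA_GRAPHS guard in the patch:

    #!/usr/bin/env bash
    # Hypothetical stand-in for the translateTGIToLMI helper used in the patch.
    # Assumed behavior: export the LMI var with the TGI var's value when the
    # TGI var is set and the LMI var is not already set by the user.
    translateTGIToLMI() {
      local tgi_var="$1" lmi_var="$2"
      if [[ -n "${!tgi_var}" && -z "${!lmi_var}" ]]; then
        export "$lmi_var"="${!tgi_var}"
      fi
    }

    # Simulate a container launched with TGI-style env vars.
    export ENABLE_CUDA_GRAPHS=true
    export CUDA_MEMORY_FRACTION=0.9
    export SERVING_FEATURES=trtllm

    # Same logic as the hunk above: CUDA graphs enabled means eager mode off.
    if [[ -n "$ENABLE_CUDA_GRAPHS" && -z "$OPTION_ENFORCE_EAGER" ]]; then
      if [[ "$ENABLE_CUDA_GRAPHS" = true ]]; then
        export OPTION_ENFORCE_EAGER=false
      else
        export OPTION_ENFORCE_EAGER=true
      fi
    fi
    # CUDA_MEMORY_FRACTION maps to a backend-specific LMI option.
    if [[ "$SERVING_FEATURES" = "trtllm" ]]; then
      translateTGIToLMI "CUDA_MEMORY_FRACTION" "OPTION_KV_CACHE_FREE_GPU_MEM_FRACTION"
    else
      translateTGIToLMI "CUDA_MEMORY_FRACTION" "OPTION_GPU_MEMORY_UTILIZATION"
    fi

    echo "OPTION_ENFORCE_EAGER=$OPTION_ENFORCE_EAGER"
    # prints: OPTION_ENFORCE_EAGER=false
    echo "OPTION_KV_CACHE_FREE_GPU_MEM_FRACTION=$OPTION_KV_CACHE_FREE_GPU_MEM_FRACTION"
    # prints: OPTION_KV_CACHE_FREE_GPU_MEM_FRACTION=0.9

With SERVING_FEATURES unset or set to anything other than "trtllm", the same run would instead export OPTION_GPU_MEMORY_UTILIZATION=0.9, and explicitly setting OPTION_ENFORCE_EAGER beforehand suppresses the ENABLE_CUDA_GRAPHS mapping entirely, since user-provided LMI options take precedence over translated TGI ones.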