From 77b20e2524cbf45ca4e339d48a62b711ca450392 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Wed, 10 Apr 2024 18:19:12 -0400 Subject: [PATCH] chore: update vllm to use gptq quantized model (#378) * chore: update vllm to use gptq quantized model * bug: fix catch-all wildcard for e2e workflow Signed-off-by: Andrew Risse --- .github/workflows/e2e.yaml | 2 +- packages/vllm/Dockerfile | 6 +++--- packages/vllm/main.py | 3 +++ packages/vllm/scripts/model_download.py | 7 ++++--- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 24cf083d69..298626e031 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -3,7 +3,7 @@ on: pull_request: paths: # Catch-all - - "*" + - "**" # Ignore updates to the .github directory, unless it's this current file - "!.github/**" diff --git a/packages/vllm/Dockerfile b/packages/vllm/Dockerfile index 3f7ba07ee3..a11404c122 100755 --- a/packages/vllm/Dockerfile +++ b/packages/vllm/Dockerfile @@ -36,14 +36,14 @@ COPY build/leapfrogai_api*.whl leapfrogai_api-100.100.100-py3-none-any.whl RUN pip install "leapfrogai_api-100.100.100-py3-none-any.whl[vllm]" --no-index --find-links=build/ # download model -ARG REPO_ID=TheBloke/Synthia-7B-v2.0-AWQ -ARG REVISION=main +ARG REPO_ID=TheBloke/Synthia-7B-v2.0-GPTQ +ARG REVISION=gptq-4bit-32g-actorder_True ENV HF_HOME=/home/leapfrogai/.cache/huggingface COPY scripts/model_download.py scripts/model_download.py RUN REPO_ID=${REPO_ID} FILENAME=${FILENAME} REVISION=${REVISION} python3.11 scripts/model_download.py -ENV QUANTIZATION=awq +ENV QUANTIZATION=gptq COPY main.py . COPY config.yaml . 
diff --git a/packages/vllm/main.py b/packages/vllm/main.py index 68f2273e7d..0bb33dc7e4 100644 --- a/packages/vllm/main.py +++ b/packages/vllm/main.py @@ -93,6 +93,9 @@ def __init__(self): quantization=os.environ["QUANTIZATION"] or None, max_context_len_to_capture=self.backend_config.max_context_length, worker_use_ray=True, + max_model_len=self.backend_config.max_context_length, + dtype="auto", + gpu_memory_utilization=0.90, ) self.engine = AsyncLLMEngine.from_engine_args(self.engine_args) diff --git a/packages/vllm/scripts/model_download.py b/packages/vllm/scripts/model_download.py index 36c8c7d641..4ae952e838 100644 --- a/packages/vllm/scripts/model_download.py +++ b/packages/vllm/scripts/model_download.py @@ -1,8 +1,9 @@ -from huggingface_hub import snapshot_download import os -REPO_ID = os.environ.get("REPO_ID", "TheBloke/Synthia-7B-v2.0-AWQ") -REVISION = os.environ.get("REVISION", "main") +from huggingface_hub import snapshot_download + +REPO_ID = os.environ.get("REPO_ID", "TheBloke/Synthia-7B-v2.0-GPTQ") +REVISION = os.environ.get("REVISION", "gptq-4bit-32g-actorder_True") os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"