Skip to content

Commit

Permalink
Add quantization support for TGI (#757)
Browse files Browse the repository at this point in the history
* Add quantization support for TGI

* Fix formatting

* Move comment into description
  • Loading branch information
achandrasekar authored Jul 30, 2024
1 parent fdef210 commit 464a071
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ resource "kubernetes_manifest" "default" {
model_id = var.model_id
gpu_count = var.gpu_count
max_concurrent_requests = var.max_concurrent_requests
quantization = var.quantization
ksa = var.ksa
hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
}))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,18 @@ spec:
- containerPort: 80
image: "ghcr.io/huggingface/text-generation-inference:1.4.2"
args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}", "--max-concurrent-requests", "${max_concurrent_requests}"]
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
env:
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
- name: HUGGING_FACE_HUB_TOKEN # Related token consumption
valueFrom:
secretKeyRef:
name: hf-token
key: HF_TOKEN
%{ endfor ~}
%{ if quantization != "" ~}
- name: QUANTIZE
value: "${quantization}"
%{ endif ~}
resources:
limits:
nvidia.com/gpu: ${gpu_count} # number of gpu's allocated to workload
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ variable "max_concurrent_requests" {
}
}

# Optional quantization mode forwarded to the TGI container through the
# QUANTIZE environment variable. The manifest template emits that variable
# only when this value is not the empty string, so "" means "no quantization".
variable "quantization" {
  description = "Quantization used for the model. Can be one of the quantization options mentioned in https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/launcher#quantize. `eetq` and `bitsandbytes` can be applied to any models whereas others might require the use of quantized checkpoints."
  type        = string
  # nullable = false makes Terraform fall back to the default ("") when a
  # caller passes null explicitly. The template only checks for "", so a null
  # would otherwise pass that check and fail when interpolated into the
  # QUANTIZE value.
  nullable = false
  default  = ""
}

variable "ksa" {
description = "Kubernetes Service Account used for workload."
type = string
Expand Down

0 comments on commit 464a071

Please sign in to comment.