feat: add int8 quantizing script (#132)

Co-authored-by: Avram Tudor <tudor.avram@8x8.com>
jitsi · Dec 12, 2024 · 25eeda3 · 25eeda3
1 parent bf96344
commit 25eeda3
Show file tree

Hide file tree

Showing 5 changed files with 168 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ It is comprised of specialized modules which can be enabled or disabled as neede
 
 ```bash
 # if VLLM cannot be used, make sure to have Ollama started. In that case LLAMA_PATH should be the model name, like "llama3.1".
-export LLAMA_PATH="$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"
+export LLAMA_PATH="$HOME/models/Llama-3.1-8B-Instruct"
 
 # disable authorization (for testing)
 export BYPASS_AUTHORIZATION=1

diff --git a/lint.sh b/lint.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 
-poetry run black skynet
-poetry run usort format skynet
+poetry run black skynet tools
+poetry run usort format skynet tools
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ poetry-plugin-export = "^1.6.0"
 usort = "^1.0.7"
 argparse = "^1.4.0"
 tqdm = "^4.66.5"
+llmcompressor = "^0.3.0"
 
 [tool.poetry.group.test.dependencies]
 pytest = "7.4.4"

diff --git a/tools/quantize.py b/tools/quantize.py
@@ -0,0 +1,60 @@
+# Courtesy of https://docs.vllm.ai/en/stable/quantization/int8.html
+
+from datasets import load_dataset
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.transformers import oneshot, SparseAutoModelForCausalLM
+from transformers import AutoTokenizer
+
+from skynet.env import llama_path as MODEL_ID
+
+model = SparseAutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load the calibration dataset, keep a seeded sample of NUM_CALIBRATION_SAMPLES examples, then render and tokenize them
+ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+
+def preprocess(example):
+    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+
+
+ds = ds.map(preprocess)
+
+
+def tokenize(sample):
+    return tokenizer(
+        sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+
+# Configure the quantization recipe: SmoothQuant plus GPTQ (W8A8 scheme on Linear layers, lm_head ignored)
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+]
+
+# Apply the recipe to the model in one shot, calibrating on the sampled dataset
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Save the compressed model and its tokenizer under MODEL_ID plus a descriptive suffix
+SAVE_DIR = MODEL_ID + "-W8A8-Dynamic-Per-Token"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)