Skip to content

Commit

Permalink
neo quantization script fixes
Browse files Browse the repository at this point in the history
- for FP8, perform dataset preparation and tokenization on our side rather
  than through llmcompressor; this prevents a timeout during quantization.
- use the lmi-dist venv for AWQ, because AutoAWQ is incompatible with the newer HF Transformers version in the vLLM venv
  • Loading branch information
a-ys committed Feb 5, 2025
1 parent 7d003c8 commit 7596018
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 9 deletions.
32 changes: 24 additions & 8 deletions serving/docker/partition/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from properties_manager import PropertiesManager
from huggingface_hub import snapshot_download
from datasets import load_dataset

from utils import (get_partition_cmd, extract_python_jar,
get_python_executable, get_download_dir,
Expand Down Expand Up @@ -295,18 +296,33 @@ def fp8_quantize(self):
"model": model,
"recipe": recipe,
}
if "dynamic" in recipe.scheme:
pass
else:
oneshot_kwargs["dataset"] = "cnn_dailymail"
oneshot_kwargs["num_calibration_samples"] = int(
self.properties.get("option.calib_size", 512))
oneshot_kwargs["max_seq_length"] = int(

# no dataset necessary if using dynamic activation scales
if "dynamic" not in recipe.scheme.lower():
calib_size = int(self.properties.get("option.calib_size", 512))
max_seq_length = int(
self.properties.get("option.max_model_len", 2048))

ds = load_dataset("abisee/cnn_dailymail",
"3.0.0",
split="validation")
ds = ds.shuffle(seed=42).select(range(calib_size))

def tokenize(sample):
return tokenizer(sample["article"],
padding=False,
truncation=True,
max_length=max_seq_length)

ds = ds.map(tokenize,
remove_columns=ds.column_names,
desc="Tokenizing calibration samples")
oneshot_kwargs["max_seq_length"] = max_seq_length
oneshot_kwargs["num_calibration_samples"] = calib_size
logging.info(
f"Using the following configuartions for fp8 quantization: {oneshot_kwargs}"
f"Using the following options for fp8 quantization: {oneshot_kwargs}"
)
oneshot_kwargs["dataset"] = ds
oneshot(**oneshot_kwargs)
logging.info(f"Quantization complete. Saving model to: {output_path}")
model.save_pretrained(output_path)
Expand Down
9 changes: 8 additions & 1 deletion serving/docker/partition/sm_neo_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,14 @@ def dispatch(self):
print("Sharding Model...")
self.run_task(NeoTask.SHARDING, python_exec)
else:
self.run_task(NeoTask.QUANTIZATION, VLLM_VENV_EXEC)
if self.properties.get("option.quantize",
"").lower() == "fp8":
python_exec = VLLM_VENV_EXEC
else:
# run awq quantization with lmi-dist venv b/c AutoAWQ
# is incompatible with newer transformers
python_exec = LMI_DIST_VENV_EXEC
self.run_task(NeoTask.QUANTIZATION, python_exec)
case "trtllm":
self.run_task(NeoTask.TENSORRT_LLM, SYSTEM_PY_EXEC)
case "vllm,lmi-dist,tnx":
Expand Down

0 comments on commit 7596018

Please sign in to comment.