From 397e880ee467f7badd8ebc467fcef8b24177da5e Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Wed, 12 Feb 2025 17:00:23 -0500 Subject: [PATCH] Add performance benchmark config: MPS 8da4w --- .ci/scripts/gather_benchmark_configs.py | 1 + .github/workflows/apple-perf.yml | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py index 0fe60a0d77..c15a64e3ed 100755 --- a/.ci/scripts/gather_benchmark_configs.py +++ b/.ci/scripts/gather_benchmark_configs.py @@ -43,6 +43,7 @@ "coreml_fp16", "mps", "llama3_coreml_ane", + "llama3_mps_8da4w", ], } diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index ea88be441c..83f03323e9 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -298,6 +298,23 @@ jobs: --coreml-compute-units cpu_and_ne \ --output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ ${{ matrix.config }} == "llama3_mps_8da4w" ]]; then + # MPS 8da4w + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -kv \ + --use_sdpa_with_kv_cache \ + --disable_dynamic_shape \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --mps \ + -qmode 8da4w \ + --group_size 32 \ + --embedding-quantize 4,32 \ + --output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" else # By default, test with the Hugging Face model and the xnnpack recipe DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")