Support exporting gguf q4_0 and q4_1 formats #393

Merged: 30 commits into main from hengguo/gguf on Jan 8, 2025

Changes from 1 commit

Commits (30)
8355347  export gguf (n1ck-guo, Dec 24, 2024)
dd55003  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 24, 2024)
f67219b  q4_0/1 port c++ to python (n1ck-guo, Dec 24, 2024)
611c4c1  Merge branch 'hengguo/gguf' of https://github.com/intel/auto-round in… (n1ck-guo, Dec 24, 2024)
ce1c48e  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 24, 2024)
7ab730b  change to llama.cpp stype and add uint8 store (n1ck-guo, Dec 25, 2024)
287b5af  abstract (n1ck-guo, Dec 25, 2024)
49d95a8  merge (n1ck-guo, Dec 25, 2024)
113532a  update (n1ck-guo, Dec 26, 2024)
ee66c47  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 26, 2024)
d395c6b  fix (n1ck-guo, Dec 26, 2024)
8b13f1f  Merge branch 'hengguo/gguf' of https://github.com/intel/auto-round in… (n1ck-guo, Dec 26, 2024)
ce2c346  update (n1ck-guo, Dec 30, 2024)
8bceb3f  default sequence eval (n1ck-guo, Dec 30, 2024)
722a1d8  modify by comments (n1ck-guo, Dec 30, 2024)
8712170  update (n1ck-guo, Dec 30, 2024)
1aa979a  pylint (n1ck-guo, Dec 30, 2024)
515160d  clean (n1ck-guo, Dec 30, 2024)
a064c44  pylint (n1ck-guo, Dec 30, 2024)
fa2328d  fix (n1ck-guo, Dec 30, 2024)
7906284  update (n1ck-guo, Dec 31, 2024)
4261191  Merge branch 'main' into hengguo/gguf (n1ck-guo, Dec 31, 2024)
e525f97  add ut (n1ck-guo, Dec 31, 2024)
b0f96a0  add cuda ut (n1ck-guo, Dec 31, 2024)
c7ec3a5  add requirements (n1ck-guo, Dec 31, 2024)
79c5c5a  format (n1ck-guo, Dec 31, 2024)
2720287  code scane (n1ck-guo, Dec 31, 2024)
db15354  update (n1ck-guo, Jan 7, 2025)
24a68a9  merge main (n1ck-guo, Jan 7, 2025)
cb67c1a  update (n1ck-guo, Jan 7, 2025)
8 changes: 8 additions & 0 deletions auto_round/autoround.py
@@ -1267,6 +1267,14 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
             if processor is not None:
                 processor.save_pretrained(output_dir)
             return
+        if format in ["gguf:q4_0", "gguf:q4_1"]:
+            if self.group_size != 32:
+                logger.error(f"{format} need group_size=32, but it is {self.group_size}, cannot export.")
+                return
+            if format == "gguf:q4_0" and not self.sym:
+                logger.warning(f"incorrect format choose, will reset to gguf:q4_1")
+            if format == "gguf:q4_1" and self.sym:
+                logger.warning(f"incorrect format choose, will reset to gguf:q4_0")

         from auto_round.export import EXPORT_FORMAT
         backend = format

Reviewer comment (Contributor), on the group_size check: also better check bits
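To make the new constraints concrete, here is a minimal end-to-end usage sketch. It is mine, not part of this PR: the model name is a placeholder and the constructor arguments are assumed from AutoRound's public API. Per the checks above, gguf:q4_0 pairs with symmetric quantization, gguf:q4_1 with asymmetric, and both require group_size=32.

    # Hypothetical usage sketch of the new export path; not from this PR.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from auto_round import AutoRound

    model_name = "facebook/opt-125m"  # placeholder model
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # gguf:q4_0 expects sym=True, gguf:q4_1 expects sym=False; both require
    # group_size=32, otherwise save_quantized logs an error and returns.
    autoround = AutoRound(model, tokenizer, bits=4, group_size=32, sym=True)
    autoround.quantize()
    autoround.save_quantized("./opt-125m-gguf", format="gguf:q4_0")
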
2 changes: 1 addition & 1 deletion auto_round/export/export_to_gguf/convert.py
@@ -385,7 +385,7 @@ def prepare_tensors(self):
             # n_dims is implicit in the shape
             logger.info(
                 f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype}"
-                " --> {data_qtype.name}, shape = {shape_str}")
+                f" --> {data_qtype.name}, shape = {shape_str}")

self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

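For context, the one-character fix above adds a missing f prefix to the second string: without it Python performs no interpolation, so the log line printed the placeholder text literally. A standalone two-line illustration (not PR code):

    data_qtype_name = "Q4_0"          # stand-in for data_qtype.name in the hunk above
    print(" --> {data_qtype_name}")   # prints the braces literally
    print(f" --> {data_qtype_name}")  # prints ' --> Q4_0'
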
2 changes: 1 addition & 1 deletion auto_round/export/export_to_gguf/quant.py
@@ -81,7 +81,7 @@ def q4_0_quant_block(blocks: np.array, scale = None, zp = None):
 def q4_1_quant_block(blocks: np.array, scale = None, zp = None):
     if scale is not None:
         d = scale.reshape((-1,1))
-        min = zp.reshape((-1,1))
+        min = zp.reshape((-1,1)) * d * -1
     else:
         max = blocks.max(axis=-1, keepdims=True)
         min = blocks.min(axis=-1, keepdims=True)
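To see why the fix multiplies the zero point by the scale, here is a standalone numpy sketch of llama.cpp-style q4_1 block quantization (my reconstruction, not the PR's implementation). q4_1 stores, per 32-value block, a scale d, a minimum m, and 16 bytes of packed 4-bit codes, and dequantizes as x ≈ q * d + m. AutoRound's asymmetric convention x ≈ (q - zp) * d therefore maps to m = -zp * d, which is exactly the corrected line.

    # Reference sketch under stated assumptions; not the PR's code.
    import numpy as np

    def q4_1_quant_block_ref(blocks: np.ndarray, scale=None, zp=None):
        """Quantize (n_blocks, 32) float blocks to q4_1-style (d, m, packed codes)."""
        if scale is not None:
            d = scale.reshape((-1, 1)).astype(np.float32)
            m = zp.reshape((-1, 1)) * d * -1.0  # min = -zp * scale, as in the fix
        else:
            mx = blocks.max(axis=-1, keepdims=True)
            mn = blocks.min(axis=-1, keepdims=True)
            d = (mx - mn) / 15.0                # 4-bit codes span 0..15
            m = mn
        with np.errstate(divide="ignore", invalid="ignore"):
            inv_d = np.where(d == 0, 0.0, 1.0 / d)
        q = np.clip(np.rint((blocks - m) * inv_d), 0, 15).astype(np.uint8)
        # llama.cpp packs element i into the low nibble and element i + 16
        # of the same block into the high nibble of byte i
        packed = q[:, :16] | (q[:, 16:] << 4)
        return d, m, packed

For example, with scale = 0.1 and zp = 8, the stored minimum is -0.8, and the code 8 dequantizes to 8 * 0.1 - 0.8 = 0, matching the (q - zp) * d convention.
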
25 changes: 14 additions & 11 deletions auto_round/script/llm.py
@@ -269,23 +269,26 @@ def tune(args):
     if args.format is None:
         args.format = "auto_round"
     supported_formats = ["auto_round", "auto_gptq", "auto_awq", "auto_round:auto_gptq", "auto_round:auto_awq",
-                         "auto_gptq:marlin", "gguf:q4_0", "gguf:q4_1", "itrex", "iterx_xpu", "fake"]
+                         "auto_gptq:marlin", "gguf:q4_0", "gguf:q4_1", "itrex", "itrex_xpu", "fake"]
     formats = args.format.lower().replace(' ', '').split(",")
     for format in formats:
         if format not in supported_formats:
             raise ValueError(f"{format} is not supported, we only support {supported_formats}")
         if format in ["gguf:q4_0", "gguf:q4_1"]:
Reviewer comment (Contributor) on this line: support gguf later if we could infer the exact type from the quantization config

             args.bits = 4
-            if args.act_bits <= 8:
-                logger.warning(f"{args.format} not support for activation quantization.")
-            if args.group_size != 32:
-                logger.warning(f"{args.format} not support for group_size: {args.group_size}. "
-                               "Reset group_size to 32.")
-                args.group_size = 32
-            if args.format.endswith("_0"):
-                args.asym = False
-            if args.format.endswith("_1"):
-                args.asym = True
+            if args.act_bits <= 8:
+                logger.warning(f"{args.format} not support for activation quantization.")
+            if args.group_size != 32:
+                logger.warning(f"{args.format} not support for group_size: {args.group_size}. "
+                               "Reset group_size to 32.")
+                args.group_size = 32
+            if args.format.endswith("_0") and args.asym:
+                logger.warning(f"{args.format} not support for asymmetric quantization, will reset to sym.")
+                args.asym = False
+            if args.format.endswith("_1") and not args.asym:
+                logger.warning(f"{args.format} not support for symmetric quantization, will reset to asym.")
+                args.asym = True
+            logger.info(f"export format {format}, sym = {not args.asym}, group_size = {args.group_size}")

if "auto_gptq" in args.format and args.asym is True:
print(
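The rewritten branch reduces to a small normalization rule, sketched below as a standalone function (the name normalize_gguf_args is mine, for illustration): q4_0 stores only a per-block scale, so it must be symmetric, while q4_1 additionally stores a per-block minimum, so it must be asymmetric, and both use 32-element blocks.

    def normalize_gguf_args(fmt: str, asym: bool, group_size: int) -> tuple:
        """Coerce tune() arguments to what gguf:q4_0 / gguf:q4_1 can represent."""
        if group_size != 32:
            group_size = 32    # both gguf 4-bit formats use 32-element blocks
        if fmt.endswith("_0") and asym:
            asym = False       # q4_0 has no per-block minimum: symmetric only
        if fmt.endswith("_1") and not asym:
            asym = True        # q4_1 stores a per-block minimum: asymmetric
        return asym, group_size

    print(normalize_gguf_args("gguf:q4_0", asym=True, group_size=128))  # (False, 32)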