Fix bug in evaluating user model #444

Merged 3 commits on Feb 18, 2025
17 changes: 13 additions & 4 deletions auto_round/script/llm.py
@@ -30,7 +30,13 @@
 import argparse
 import sys
 
-from auto_round.utils import get_fp_layer_names, set_cuda_visible_devices, clear_memory, is_debug_mode
+
+from auto_round.utils import (
+    get_fp_layer_names,
+    clear_memory,
+    is_debug_mode,
+    get_device_and_parallelism,
+    set_cuda_visible_devices)
 
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

@@ -338,7 +344,8 @@ def tune(args):
         assert False, "marlin backend only supports sym quantization, please remove --asym"
 
     ##must set this before import torch
-    device_str, use_auto_mapping = set_cuda_visible_devices(args.device)
+    set_cuda_visible_devices(args.device)
+    device_str, use_auto_mapping = get_device_and_parallelism(args.device)
 
     import torch
     if not args.disable_deterministic_algorithms:
@@ -558,7 +565,8 @@ def tune(args):
             dispatch_model(model, model.hf_device_map)
             user_model = model
         else:
-            user_model = model # .to(device_str)
+            device_str = detect_device(device_str)
+            user_model = model.to(device_str)
 
     if args.eval_bs is None or args.eval_bs == "auto":
         args.eval_bs = 16
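
This hunk is the core of the fix: with the `.to(device_str)` call commented out, the non-dispatched model stayed on whatever device it was loaded to (typically CPU) while evaluation fed it inputs on device_str. A minimal sketch of the failure mode and the repair, using a hypothetical toy module in place of the real quantized model:

import torch

# Hypothetical stand-in for the loaded model; the real script uses the quantized HF model.
model = torch.nn.Linear(4, 4)  # parameters start on the CPU after loading
device_str = "cuda" if torch.cuda.is_available() else "cpu"

# Before the fix: user_model = model (the .to() was commented out), so inputs
# created on device_str could hit CPU weights and raise a device-mismatch error.
user_model = model.to(device_str)  # after the fix: weights follow the eval device
x = torch.randn(2, 4, device=device_str)
y = user_model(x)  # inputs and weights now live on the same device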
@@ -573,7 +581,8 @@
 
 
 def _eval_init(tasks, model_path, device, disable_trust_remote_code=False):
-    device_str, parallelism = set_cuda_visible_devices(device)
+    set_cuda_visible_devices(device)
+    device_str, parallelism = get_device_and_parallelism(device)
     ##model_args = f'pretrained={model_path},trust_remote_code={not disable_trust_remote_code},add_bos_token=True'
     model_args = f'pretrained={model_path},trust_remote_code={not disable_trust_remote_code}'
     if parallelism:
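Taken together, the llm.py changes split the old helper into two explicit steps: set_cuda_visible_devices() now only mutates the CUDA_VISIBLE_DEVICES environment variable (and must still run before torch is imported), while get_device_and_parallelism() maps the same CLI argument to a torch device string plus a multi-card flag. A minimal sketch of the new calling convention, where setup_device is a hypothetical wrapper and the argument follows the repo's CLI device syntax ("0", "0,1", "auto", "cpu"):

from auto_round.utils import set_cuda_visible_devices, get_device_and_parallelism

def setup_device(device_arg: str):
    # Step 1: pin the visible GPUs before torch initializes CUDA.
    set_cuda_visible_devices(device_arg)
    # Step 2: derive the device string and whether to shard across cards.
    device_str, use_auto_mapping = get_device_and_parallelism(device_arg)
    return device_str, use_auto_mapping

Keeping the environment mutation separate from the device-string computation is what lets tune() run the first step before importing torch, since the visible-device set cannot be changed once CUDA is initialized.
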
20 changes: 15 additions & 5 deletions auto_round/script/mllm.py
@@ -16,7 +16,14 @@
 import sys
 import argparse
 
-from auto_round.utils import clear_memory, get_fp_layer_names, set_cuda_visible_devices, logger, is_debug_mode
+from auto_round.utils import (
+    get_fp_layer_names,
+    clear_memory,
+    is_debug_mode,
+    get_device_and_parallelism,
+    set_cuda_visible_devices,
+    logger)
+
 
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

@@ -276,7 +283,6 @@ def tune(args):
     import transformers
 
     from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoProcessor
-    from lm_eval.utils import make_table # pylint: disable=E0401
 
     if args.format is None:
         args.format = "auto_round"
@@ -290,7 +296,9 @@
         raise ValueError(f"{format} is not supported, we only support {supported_formats}")
 
     ##must set this before import torch
-    device_str, use_auto_mapping = set_cuda_visible_devices(args.device)
+    set_cuda_visible_devices(args.device)
+    device_str, use_auto_mapping = get_device_and_parallelism(args.device)
+
 
     import torch
     if not args.disable_deterministic_algorithms:
@@ -495,7 +503,8 @@ def tune(args):
 
 
 def eval(args):
-    device_str, parallelism = set_cuda_visible_devices(args.device)
+    set_cuda_visible_devices(args.device)
+    device_str, parallelism = get_device_and_parallelism(args.device)
     if parallelism:
         os.environ['AUTO_SPLIT'] = '1'
     if isinstance(args.tasks, str):
@@ -570,7 +579,8 @@ def setup_lmms_parser():
 
 
 def lmms_eval(args):
-    device_str, parallelism = set_cuda_visible_devices(args.device)
+    set_cuda_visible_devices(args.device)
+    device_str, parallelism = get_device_and_parallelism(args.device)
 
     from auto_round.mllm import lmms_eval
 
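The mllm.py entry points (tune, eval, lmms_eval) adopt the same two-step convention; the extra detail here is that eval() advertises multi-card mode to downstream loading via the AUTO_SPLIT environment variable. A sketch of that flow, with prepare_eval as a hypothetical wrapper name:

import os
from auto_round.utils import set_cuda_visible_devices, get_device_and_parallelism

def prepare_eval(device_arg: str):
    set_cuda_visible_devices(device_arg)
    device_str, parallelism = get_device_and_parallelism(device_arg)
    if parallelism:
        # Mirrors eval() above: downstream model loading reads this flag
        # and splits the model across the visible cards.
        os.environ['AUTO_SPLIT'] = '1'
    return device_str
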
25 changes: 13 additions & 12 deletions auto_round/utils.py
@@ -1160,10 +1160,22 @@ def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=None):
 
     return True, ""
 
+def get_device_and_parallelism(device):
+    from auto_round.utils import detect_device
+    devices = device.replace(" ", "").split(',')
+    if all(s.isdigit() for s in devices) and len(devices) > 1:
+        device = "cuda"
+        parallelism = True
+    elif device == "auto":
+        device = detect_device(device)
+        parallelism = True
+    else:
+        device = detect_device(device)
+        parallelism = False
+    return device, parallelism
 
 def set_cuda_visible_devices(device):
     devices = device.replace(" ", "").split(',')
-    parallelism = False
     if all(s.isdigit() for s in devices):
         if "CUDA_VISIBLE_DEVICES" in os.environ:
             current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
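
The top of set_cuda_visible_devices() handles the case where CUDA_VISIBLE_DEVICES is already set; the lines collapsed between the two hunks appear to remap the requested indices into the currently visible device list rather than treating them as absolute GPU ids. A hedged sketch of that remapping, assuming the elided lines follow the pattern the visible variables suggest:

import os

def remap_visible(requested: str):
    # Hypothetical condensation of the elided lines: treat each requested
    # index as a position into the already-visible device list.
    devices = requested.replace(" ", "").split(',')
    current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
    picked = [current_visible_devices[int(i)] for i in devices]
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(picked)

# With CUDA_VISIBLE_DEVICES="4,5,6", requesting "1,2" narrows it to "5,6".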
@@ -1180,17 +1192,6 @@ def set_cuda_visible_devices(device):
             os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
         else:
             os.environ["CUDA_VISIBLE_DEVICES"] = device
-        device = ",".join(map(str, range(len(devices))))
-    devices = device.replace(" ", "").split(',')
-    if len(devices) > 1: ##for 70B model on single card, use auto will cause some layer offload to cpu
-        parallelism = True
-        device_str = None
-    elif device == "auto":
-        device_str = None
-        parallelism = True
-    else:
-        device_str = detect_device(device.replace(" ", ""))
-    return device_str, parallelism


def is_debug_mode():
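Reading the new helper branch by branch: more than one numeric index yields plain "cuda" with parallelism on, "auto" resolves through detect_device() with parallelism also on, and anything else (a single index or an explicit device such as "cpu") resolves to one device with parallelism off. Illustrative calls, assuming a CUDA machine; the exact strings depend on what detect_device() returns:

from auto_round.utils import get_device_and_parallelism

print(get_device_and_parallelism("0,1"))   # ("cuda", True): multiple indices, data-parallel eval
print(get_device_and_parallelism("auto"))  # ("cuda", True): best available device, auto split allowed
print(get_device_and_parallelism("cpu"))   # ("cpu", False): single explicit device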