[CI] Use smaller models in unit tests (ModelCloud#72)
* replace models in test files

* rename size to GENERATE_EVAL_SIZE

---------

Co-authored-by: zyc-modelcloud <zyc@modelcloud.ai>
ZYC-ModelCloud authored Jun 26, 2024
1 parent 3302646 commit e99f5a5
Showing 4 changed files with 63 additions and 38 deletions.
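
To make the intent of the diffs below easier to follow, here is a minimal sketch of the test pattern this commit moves to: load the smaller LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit checkpoint and compare only the first GENERATE_EVAL_SIZE characters of the generated text against the reference. The helper name, argument defaults, and generation length are illustrative assumptions rather than code from the commit; it presumes the gptqmodel from_quantized API and a CUDA device exactly as used in the tests.

from gptqmodel import GPTQModel
from transformers import AutoTokenizer

GENERATE_EVAL_SIZE = 100  # compare only the first 100 generated characters

MODEL_ID = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"


def check_prefix_match(reference_output, revision=None):
    # Load the small 4-bit TinyLlama checkpoint used across the updated tests.
    model_q = GPTQModel.from_quantized(
        MODEL_ID,
        revision=revision,
        device="cuda:0",
        use_triton=False,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    inp = tokenizer("I am in Paris and", return_tensors="pt").to("cuda:0")

    res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
    predicted_text = tokenizer.decode(res[0])

    # Assert only on a prefix so small tail differences between kernels,
    # dtypes, or hardware do not make the comparison flaky.
    assert predicted_text[:GENERATE_EVAL_SIZE] == reference_output[:GENERATE_EVAL_SIZE]
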
28 changes: 17 additions & 11 deletions tests/test_q4_cuda.py
@@ -18,6 +18,7 @@
from gptqmodel import GPTQModel # noqa: E402
from transformers import AutoTokenizer # noqa: E402

GENERATE_EVAL_SIZE = 100

def get_diff(a, ref):
eps = 1e-6
@@ -599,7 +600,7 @@ def test_cuda_old(self, use_half2: bool):
(torch.float16, "cuda:0"),
]
)
def test_generation_with_act_order(self, torch_dtype, device):
def test_generation_desc_act_true(self, torch_dtype, device):
prompt = "I am in Paris and"

# Reference generated with the cuda-old kernel
@@ -608,13 +609,15 @@ def test_generation_with_act_order(self, torch_dtype, device):
new_tokens = 2
reference_output = "<s> I am in Paris and I am"
else:
reference_output = "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\nThe stage is now set in a Parisian café. The café is filled with people, including a group of friends, a couple, and a group of tourists. The friends are discussing their plans for the"
new_tokens = 60

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device=device,
use_triton=False,
disable_exllama=True,
@@ -629,12 +632,13 @@ def test_generation_with_act_order(self, torch_dtype, device):
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

@parameterized.expand(
[
@@ -643,19 +647,19 @@ def test_generation_with_act_order(self, torch_dtype, device):
(torch.float16, "cuda:0"),
]
)
def test_generation_no_act_order(self, torch_dtype, device):
def test_generation_desc_act_false(self, torch_dtype, device):
prompt = "I am in Paris and"

# Reference generated with the cuda-old kernel
if device == "cpu":
# CPU implementation is extremely slow.
new_tokens = 3
reference_output = "<s> I am in Paris and I am going"
reference_output = "<s> I am in Paris and I am in"
else:
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can see the characters walking around the stage.)\n\n(The"
new_tokens = 60

model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"

model_q = GPTQModel.from_quantized(
model_id,
@@ -672,9 +676,11 @@ def test_generation_no_act_order(self, torch_dtype, device):
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])
27 changes: 18 additions & 9 deletions tests/test_q4_exallama.py
@@ -1,4 +1,7 @@
# -- do not touch
import numpy
import random

import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
@@ -1050,6 +1053,8 @@
]
).to(torch.float16)

GENERATE_EVAL_SIZE = 100


class TestsQ4Exllama(unittest.TestCase):
def test_exllama(self):
@@ -1124,10 +1129,12 @@ def test_exllama_buffer_size(self):
prompt = "I am in Paris and" * 450
device = torch.device("cuda:0")

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
disable_exllama=False,
@@ -1155,14 +1162,14 @@ def test_exllama_buffer_size(self):
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
self.assertTrue("temp_state buffer is too small" in str(cm.exception))

def test_generation_no_act_order(self):
def test_generation_desc_act_false(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can see the characters walking around the stage.)\n\n(The"

model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
model_q = GPTQModel.from_quantized(
model_id,
device="cuda:0",
@@ -1178,19 +1185,21 @@ def test_generation_no_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_generation_with_act_order(self):
def test_generation_desc_act_true(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\nThe stage is now set in a Parisian café. The café is filled with people, including a group of friends, a couple, and a group of tourists. The friends are discussing their plans for the"

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
disable_exllama=False,
@@ -1204,7 +1213,7 @@ def test_generation_with_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_multigpu(self):
# TODO
23 changes: 14 additions & 9 deletions tests/test_q4_exallama_v2.py
@@ -21,6 +21,7 @@
from test_q4_cuda import get_diff # noqa: E402
from transformers import AutoTokenizer # noqa: E402

GENERATE_EVAL_SIZE = 100

class TestsQ4ExllamaV2(unittest.TestCase):
def test_exllamav2(self):
@@ -74,14 +75,14 @@ def test_exllamav2(self):
get_diff(res, reference),
)

def test_generation_no_act_order(self):
def test_generation_desc_act_false(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can only see the characters' silhouettes.)\n\n("

model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"

model_q = GPTQModel.from_quantized(model_id, device="cuda:0", use_triton=False)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -92,19 +93,21 @@ def test_generation_no_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_generation_with_act_order(self):
def test_generation_desc_act_true(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can see the characters walking around the stage.)\n\n(The"

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
)
@@ -116,17 +119,19 @@ def test_generation_with_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_exllama_v2_buffer_size(self):
# prompt = "I'm in Paris and" * 450
prompt = "I'm in Paris and" * 500
device = torch.device("cuda:0")

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
)
23 changes: 14 additions & 9 deletions tests/test_q4_triton.py
@@ -17,15 +17,16 @@
from gptqmodel import GPTQModel # noqa: E402
from transformers import AutoTokenizer # noqa: E402

GENERATE_EVAL_SIZE = 100

class TestsQ4Triton(unittest.TestCase):
def test_generation_no_act_order(self):
def test_generation_desc_act_false(self):
prompt = "I am in Paris and"

reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can only see the characters' silhouettes.)\n\n("
new_tokens = 60

model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"

model_q = GPTQModel.from_quantized(
model_id,
@@ -48,25 +49,29 @@ def test_generation_no_act_order(self):
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

def test_generation_with_act_order(self):
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_generation_desc_act_true(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\nThe stage is now set in a Parisian café. The café is filled with people, including a group of friends, a couple, and a group of tourists. The friends are discussing their plans for the"

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
device="cuda:0",
revision=revision,
use_triton=True,
disable_exllama=True,
disable_exllamav2=True,
@@ -85,4 +90,4 @@ def test_generation_with_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])
