[CI] Use smaller models in unit tests (ModelCloud#72)
* replace models in test files

* rename size to GENERATE_EVAL_SIZE

---------

Co-authored-by: zyc-modelcloud <zyc@modelcloud.ai>
ZYC-ModelCloud authored Jun 26, 2024
1 parent 3302646 commit e99f5a5
Showing 4 changed files with 63 additions and 38 deletions.
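
To make the intent of the diffs below easier to follow, here is a minimal sketch of the test pattern this commit moves to: load the smaller LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit checkpoint and compare only the first GENERATE_EVAL_SIZE characters of the generated text against the reference. The helper name, argument defaults, and generation length are illustrative assumptions rather than code from the commit; it presumes the gptqmodel from_quantized API and a CUDA device exactly as used in the tests.

from gptqmodel import GPTQModel
from transformers import AutoTokenizer

GENERATE_EVAL_SIZE = 100  # compare only the first 100 generated characters

MODEL_ID = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"


def check_prefix_match(reference_output, revision=None):
    # Load the small 4-bit TinyLlama checkpoint used across the updated tests.
    model_q = GPTQModel.from_quantized(
        MODEL_ID,
        revision=revision,
        device="cuda:0",
        use_triton=False,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    inp = tokenizer("I am in Paris and", return_tensors="pt").to("cuda:0")

    res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
    predicted_text = tokenizer.decode(res[0])

    # Assert only on a prefix so small tail differences between kernels,
    # dtypes, or hardware do not make the comparison flaky.
    assert predicted_text[:GENERATE_EVAL_SIZE] == reference_output[:GENERATE_EVAL_SIZE]
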
28 changes: 17 additions & 11 deletions tests/test_q4_cuda.py
@@ -18,6 +18,7 @@
from gptqmodel import GPTQModel # noqa: E402
from transformers import AutoTokenizer # noqa: E402

GENERATE_EVAL_SIZE = 100

def get_diff(a, ref):
eps = 1e-6
@@ -599,7 +600,7 @@ def test_cuda_old(self, use_half2: bool):
(torch.float16, "cuda:0"),
]
)
def test_generation_with_act_order(self, torch_dtype, device):
def test_generation_desc_act_true(self, torch_dtype, device):
prompt = "I am in Paris and"

# Reference generated with the cuda-old kernel
@@ -608,13 +609,15 @@ def test_generation_with_act_order(self, torch_dtype, device):
new_tokens = 2
reference_output = "<s> I am in Paris and I am"
else:
reference_output = "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\nThe stage is now set in a Parisian café. The café is filled with people, including a group of friends, a couple, and a group of tourists. The friends are discussing their plans for the"
new_tokens = 60

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device=device,
use_triton=False,
disable_exllama=True,
@@ -629,12 +632,13 @@ def test_generation_with_act_order(self, torch_dtype, device):
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

@parameterized.expand(
[
@@ -643,19 +647,19 @@ def test_generation_with_act_order(self, torch_dtype, device):
(torch.float16, "cuda:0"),
]
)
def test_generation_no_act_order(self, torch_dtype, device):
def test_generation_desc_act_false(self, torch_dtype, device):
prompt = "I am in Paris and"

# Reference generated with the cuda-old kernel
if device == "cpu":
# CPU implementation is extremely slow.
new_tokens = 3
reference_output = "<s> I am in Paris and I am going"
reference_output = "<s> I am in Paris and I am in"
else:
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can see the characters walking around the stage.)\n\n(The"
new_tokens = 60

model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"

model_q = GPTQModel.from_quantized(
model_id,
@@ -672,9 +676,11 @@ def test_generation_no_act_order(self, torch_dtype, device):
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])
27 changes: 18 additions & 9 deletions tests/test_q4_exallama.py
@@ -1,4 +1,7 @@
# -- do not touch
import numpy
import random

import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
@@ -1050,6 +1053,8 @@
]
).to(torch.float16)

GENERATE_EVAL_SIZE = 100


class TestsQ4Exllama(unittest.TestCase):
def test_exllama(self):
@@ -1124,10 +1129,12 @@ def test_exllama_buffer_size(self):
prompt = "I am in Paris and" * 450
device = torch.device("cuda:0")

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
disable_exllama=False,
@@ -1155,14 +1162,14 @@ def test_exllama_buffer_size(self):
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
self.assertTrue("temp_state buffer is too small" in str(cm.exception))

def test_generation_no_act_order(self):
def test_generation_desc_act_false(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can see the characters walking around the stage.)\n\n(The"

model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
model_q = GPTQModel.from_quantized(
model_id,
device="cuda:0",
@@ -1178,19 +1185,21 @@ def test_generation_no_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_generation_with_act_order(self):
def test_generation_desc_act_true(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\nThe stage is now set in a Parisian café. The café is filled with people, including a group of friends, a couple, and a group of tourists. The friends are discussing their plans for the"

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
disable_exllama=False,
@@ -1204,7 +1213,7 @@ def test_generation_with_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_multigpu(self):
# TODO
23 changes: 14 additions & 9 deletions tests/test_q4_exallama_v2.py
@@ -21,6 +21,7 @@
from test_q4_cuda import get_diff # noqa: E402
from transformers import AutoTokenizer # noqa: E402

GENERATE_EVAL_SIZE = 100

class TestsQ4ExllamaV2(unittest.TestCase):
def test_exllamav2(self):
@@ -74,14 +75,14 @@ def test_exllamav2(self):
get_diff(res, reference),
)

def test_generation_no_act_order(self):
def test_generation_desc_act_false(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can only see the characters' silhouettes.)\n\n("

model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"

model_q = GPTQModel.from_quantized(model_id, device="cuda:0", use_triton=False)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -92,19 +93,21 @@ def test_generation_no_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_generation_with_act_order(self):
def test_generation_desc_act_true(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can see the characters walking around the stage.)\n\n(The"

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
)
@@ -116,17 +119,19 @@ def test_generation_with_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_exllama_v2_buffer_size(self):
# prompt = "I'm in Paris and" * 450
prompt = "I'm in Paris and" * 500
device = torch.device("cuda:0")

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
)
23 changes: 14 additions & 9 deletions tests/test_q4_triton.py
@@ -17,15 +17,16 @@
from gptqmodel import GPTQModel # noqa: E402
from transformers import AutoTokenizer # noqa: E402

GENERATE_EVAL_SIZE = 100

class TestsQ4Triton(unittest.TestCase):
def test_generation_no_act_order(self):
def test_generation_desc_act_false(self):
prompt = "I am in Paris and"

reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can only see the characters' silhouettes.)\n\n("
new_tokens = 60

model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"

model_q = GPTQModel.from_quantized(
model_id,
@@ -48,25 +49,29 @@ def test_generation_no_act_order(self):
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)

def test_generation_with_act_order(self):
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])

def test_generation_desc_act_true(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")

# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am so excited to be here. I am here for the first time in my life and I am so grateful for this opportunity. I am here to learn and to grow and to meet new people and to experience new things. I am here to see the Eiffel Tower and to walk along"
reference_output = "<s> I am in Paris and I am in love with you.\n\nScene 2:\n\nThe stage is now set in a Parisian café. The café is filled with people, including a group of friends, a couple, and a group of tourists. The friends are discussing their plans for the"

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
revision = "desc_act_true"

model_q = GPTQModel.from_quantized(
model_id,
device="cuda:0",
revision=revision,
use_triton=True,
disable_exllama=True,
disable_exllamav2=True,
@@ -85,4 +90,4 @@ def test_generation_with_act_order(self):

predicted_text = tokenizer.decode(res[0])

self.assertEqual(predicted_text, reference_output)
self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])
