test

vllm-project · May 20, 2024 · 7369d7c · 7369d7c
1 parent db2d034
commit 7369d7c
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 33 deletions.
diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
@@ -16,31 +16,55 @@
 MAX_MODEL_LEN = 1024
 
 MODELS = [
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV",
     "meta-llama/Meta-Llama-3-8B-Instruct",
 ]
 
 EXPECTED_STRS_MAP = {
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [
-        'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-        'Zeta-5, a highly advanced robot designed for menial labor, whirred to a',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o',
-    ],
-    "meta-llama/Meta-Llama-3-8B-Instruct": [
-        'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-        'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
-    ],
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": {
+        "auto": [
+            'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
+            'Zeta-5, a highly advanced robot designed for menial labor, whirred to a',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o',
+        ],
+        "fp8": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system made up of several basic components that work together to enable it to',
+            'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk'
+        ]
+    },
+    "meta-llama/Meta-Llama-3-8B-Instruct": {
+        "auto": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+            'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
+        ],
+        "fp8": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
+            'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu'
+        ]
+    },
 }
 
 capability = torch.cuda.get_device_capability()
@@ -51,15 +75,15 @@
 
 @pytest.mark.skipif(fp8_not_supported,
                     reason="fp8 is not supported on this GPU type.")
-@pytest.mark.parametrize("model_name", MODELS)
-def test_models(
-    example_prompts,
-    model_name,
-) -> None:
+@pytest.mark.parametrize("model_name", ["meta-llama/Meta-Llama-3-8B-Instruct"])
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
+def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
     model = LLM(model=model_name,
                 max_model_len=MAX_MODEL_LEN,
+                trust_remote_code=True,
                 enforce_eager=True,
-                quantization="fp8")
+                quantization="fp8",
+                kv_cache_dtype=kv_cache_dtype)
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     formatted_prompts = [
@@ -82,7 +106,7 @@ def test_models(
     del model
 
     print(generations)
-    expected_strs = EXPECTED_STRS_MAP[model_name]
+    expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
     for i in range(len(example_prompts)):
         generated_str = generations[i]
         expected_str = expected_strs[i]

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -272,18 +272,18 @@ def create_weights(self, layer: torch.nn.Module):
         # Initialize the KV cache scale to 1.0 as the default value.
         # If the kv_scale appears in the checkpoint, it will be
         # overwritten when loading weights.
-        layer.kv_scale = Parameter(torch.ones(1, dtype=torch.float32),
-                                   requires_grad=False)
+        layer.kv_scale = Parameter(torch.tensor(1.0), requires_grad=False)
 
     def apply(self, layer: torch.nn.Module) -> torch.Tensor:
         raise RuntimeError("Fp8KVCacheMethod.apply should not be called.")
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        kv_scales = layer.kv_scale.to("cpu").tolist()
-        if len(kv_scales) > 1:
+        kv_scale = layer.kv_scale.to("cpu").tolist()
+        del layer.kv_scale
+        if not isinstance(kv_scale, float):
             raise ValueError("Only support per-tensor scaling factor "
                              "for fp8 KV cache")
-        layer._kv_scale = kv_scales[0]
+        layer._kv_scale = kv_scale  # use the loaded checkpoint scale
         if layer._kv_scale == 1.0:
             print_warning_once(
                 "Using KV cache scaling factor 1.0 for fp8_e4m3. This may "