fix / skip failing tests

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
vllm-project · Feb 4, 2025 · c8f1951 · c8f1951
1 parent 35d7cfa
commit c8f1951
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 6 deletions.
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
@@ -308,8 +308,8 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
            model_runner.model)
 
 
-@pytest.fixture(params=[False, True])
-def run_with_both_engines_lora(request):
+@pytest.fixture(params=[True, False])
+def run_with_both_engines_lora(request, monkeypatch):
     # Automatically runs tests twice, once with V1 and once without
     use_v1 = request.param
     # Tests decorated with `@skip_v1` are only run without v1
@@ -318,8 +318,8 @@ def run_with_both_engines_lora(request):
     if use_v1:
         if skip_v1:
             pytest.skip("Skipping test on vllm V1")
-        with patch('vllm.envs.VLLM_USE_V1', True):
-            yield
+        monkeypatch.setenv('VLLM_USE_V1', '1')
     else:
-        with patch('vllm.envs.VLLM_USE_V1', False):
-            yield
+        monkeypatch.setenv('VLLM_USE_V1', '0')
+
+    yield
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
@@ -57,6 +57,7 @@ def v1(run_with_both_engines_lora):
     pass
 
 
+@pytest.mark.skip_v1
 @fork_new_process_for_each_test
 def test_chatglm3_lora(chatglm3_lora_files):
     llm = vllm.LLM(MODEL_PATH,
@@ -76,6 +77,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @fork_new_process_for_each_test
 def test_chatglm3_lora_tp4(chatglm3_lora_files):
@@ -97,6 +99,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @fork_new_process_for_each_test
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):

diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py
@@ -38,6 +38,9 @@ def v1(run_with_both_engines_lora):
     pass
 
 
+# Skipping for V1 for now as we are hitting,
+# "Head size 80 is not supported by FlashAttention." error.
+@pytest.mark.skip_v1
 @pytest.mark.parametrize("lora_bias", [True])
 @pytest.mark.parametrize("fully_sharded", [True, False])
 def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool):