From 0ab278ca31028a7623098b3c7d615ad350663d05 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 3 Jun 2024 09:39:31 -0700 Subject: [PATCH 1/3] [Core] Remove unnecessary copies in flash attn backend (#5138) --- requirements-cuda.txt | 2 +- vllm/attention/backends/flash_attn.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 5109f17356178..3536179835967 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -6,4 +6,4 @@ ray >= 2.9 nvidia-ml-py # for pynvml package torch == 2.3.0 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 -vllm-flash-attn == 2.5.8.post2 # Requires PyTorch 2.3.0 +vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0 diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 0b9d6283493f2..070c074e511bc 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -317,7 +317,7 @@ def forward( # normal attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. - out = flash_attn_varlen_func( + flash_attn_varlen_func( q=query, k=key, v=value, @@ -329,14 +329,13 @@ def forward( causal=True, window_size=self.sliding_window, alibi_slopes=self.alibi_slopes, + out=output[:num_prefill_tokens], ) - assert output[:num_prefill_tokens].shape == out.shape - output[:num_prefill_tokens] = out else: # prefix-enabled attention assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) - output[:num_prefill_tokens] = flash_attn_varlen_func( + flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, @@ -348,11 +347,12 @@ def forward( causal=True, alibi_slopes=self.alibi_slopes, block_table=prefill_meta.block_tables, + out=output[:num_prefill_tokens], ) if decode_meta := attn_metadata.decode_metadata: # Decoding run. - output[num_prefill_tokens:] = flash_attn_with_kvcache( + flash_attn_with_kvcache( decode_query.unsqueeze(1), key_cache, value_cache, @@ -361,7 +361,8 @@ def forward( softmax_scale=self.scale, causal=True, alibi_slopes=self.alibi_slopes, - ).squeeze(1) + out=output[num_prefill_tokens:].unsqueeze(1), + ) # Reshape the output tensor. return output.view(num_tokens, hidden_size) From cbb2f59cc853731f5607ac0130bb6cdebfdc89c7 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Mon, 3 Jun 2024 12:52:30 -0400 Subject: [PATCH 2/3] [Kernel] Pass a device pointer into the quantize kernel for the scales (#5159) --- csrc/ops.h | 4 ++-- .../compressed_tensors/int8_quant_kernels.cu | 15 +++++++++------ tests/kernels/test_int8_quant.py | 4 +++- vllm/_custom_ops.py | 2 +- .../compressed_tensors_w8a8_statictensor.py | 2 +- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 567d9fae4bd2a..4952e826ec8ac 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -94,8 +94,8 @@ int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a, #endif -void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor& input, - float scale); +void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor const& scale); void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor lookup_table); diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 4902e4c23434c..11baa5d414c19 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -28,9 +28,10 @@ namespace vllm { template __global__ void static_scaled_int8_quant_kernel( const scalar_t* __restrict__ input, int8_t* __restrict__ out, - scale_type scale, const int hidden_size) { + const scale_type* scale_ptr, const int hidden_size) { const int tid = threadIdx.x; const int token_idx = blockIdx.x; + scale_type scale = *scale_ptr; for (int i = tid; i < hidden_size; i += blockDim.x) { out[token_idx * hidden_size + i] = @@ -39,11 +40,13 @@ __global__ void static_scaled_int8_quant_kernel( } } // namespace vllm -void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - torch::Tensor& input, // [..., hidden_size] - float scale) { +void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor const& scale) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(scale.numel() == 1); + int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); @@ -53,7 +56,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { vllm::static_scaled_int8_quant_kernel <<>>(input.data_ptr(), - out.data_ptr(), scale, - hidden_size); + out.data_ptr(), + scale.data_ptr(), hidden_size); }); } diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index b9aa00ce13f56..29890118c93dc 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -26,6 +26,8 @@ def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, torch.iinfo(torch.int8).min, torch.iinfo(torch.int8).max).to(torch.int8) out2 = torch.empty_like(x, dtype=torch.int8) - ops.static_scaled_int8_quant(out2, x, scale) + scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda") + + ops.static_scaled_int8_quant(out2, x, scale_argument) assert torch.allclose(out1, out2, atol=1) # big atol to account for rounding errors diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 22cf5a44e341f..8a6f6d96d81f3 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -265,7 +265,7 @@ def scaled_fp8_quant( # int8 def static_scaled_int8_quant(input: torch.Tensor, - scale: float) -> torch.Tensor: + scale: torch.Tensor) -> torch.Tensor: """ Quantize the input tensor to int8 and return the quantized tensor. diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py index 7e3e932cfe14a..2dfc6e2b07782 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py @@ -97,7 +97,7 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): act_scale = layer.input_scale # Input quantize - x_q = custom_ops.static_scaled_int8_quant(x, act_scale[0].item()) + x_q = custom_ops.static_scaled_int8_quant(x, act_scale) return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale, weight_scale, x.dtype) From cafb8e06c5ffa359ac7fa4b53795e6eaa1a200c7 Mon Sep 17 00:00:00 2001 From: Yuan Date: Tue, 4 Jun 2024 01:39:50 +0800 Subject: [PATCH 3/3] [CI/BUILD] enable intel queue for longer CPU tests (#4113) --- .buildkite/run-cpu-test.sh | 14 +++- .buildkite/test-template.j2 | 2 + Dockerfile.cpu | 6 +- csrc/cpu/pos_encoding.cpp | 101 ++++++++++++++-------------- tests/conftest.py | 36 ++++++---- tests/models/test_aqlm.py | 11 +-- tests/models/test_big_models.py | 10 ++- tests/models/test_fp8.py | 11 +-- tests/models/test_gptq_marlin.py | 11 +-- tests/models/test_gptq_marlin_24.py | 11 +-- tests/models/test_marlin.py | 11 +-- 11 files changed, 136 insertions(+), 88 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 414045fe163e5..d1200ee84dfe4 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -10,5 +10,15 @@ remove_docker_container() { docker rm -f cpu-test || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image and launch offline inference -docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py +# Run the image +docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test + +# offline inference +docker exec cpu-test bash -c "python3 examples/offline_inference.py" + +# Run basic model test +docker exec cpu-test bash -c "cd tests; + pip install pytest Pillow protobuf + bash ../.buildkite/download-images.sh + cd ../ + pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 265833e2ccf6e..7e986c988407c 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -40,6 +40,8 @@ steps: - label: "Intel Test" depends_on: ~ + agents: + queue: intel command: bash .buildkite/run-cpu-test.sh {% for step in steps %} diff --git a/Dockerfile.cpu b/Dockerfile.cpu index aec79824213f3..ae23e27b413ba 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -1,6 +1,6 @@ # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. -FROM ubuntu:22.04 +FROM ubuntu:22.04 AS cpu-test-1 RUN apt-get update -y \ && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \ @@ -9,6 +9,8 @@ RUN apt-get update -y \ RUN pip install --upgrade pip \ && pip install wheel packaging ninja setuptools>=49.4.0 numpy +FROM cpu-test-1 AS build + COPY ./ /workspace/vllm WORKDIR /workspace/vllm @@ -19,4 +21,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install WORKDIR /workspace/ +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + CMD ["/bin/bash"] diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp index 73bf77e46f538..e8aead17ae5a7 100644 --- a/csrc/cpu/pos_encoding.cpp +++ b/csrc/cpu/pos_encoding.cpp @@ -21,73 +21,74 @@ void rotary_embedding_impl( constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); const int embed_dim = rot_dim / 2; - TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0); + bool flag = (embed_dim % VEC_ELEM_NUM == 0); + const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM; -#pragma omp parallel for - for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { - int64_t pos = positions[token_idx]; - const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr, + scalar_t* qk) { + int j = 0; + for (; j < loop_upper; j += VEC_ELEM_NUM) { + const int rot_offset = j; + const int x_index = rot_offset; + const int y_index = embed_dim + rot_offset; - for (int i = 0; i < num_heads; ++i) { - const int head_idx = i; - const int64_t token_head = - token_idx * query_stride + head_idx * head_size; - for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) { - const int rot_offset = j; - const int x_index = rot_offset; - const int y_index = embed_dim + rot_offset; + const int64_t out_x = token_head + x_index; + const int64_t out_y = token_head + y_index; - const int64_t out_x = token_head + x_index; - const int64_t out_y = token_head + y_index; + const scalar_vec_t cos(cache_ptr + x_index); + const scalar_vec_t sin(cache_ptr + y_index); - const scalar_vec_t cos(cache_ptr + x_index); - const scalar_vec_t sin(cache_ptr + y_index); + const scalar_vec_t q_x(qk + out_x); + const scalar_vec_t q_y(qk + out_y); - const scalar_vec_t q_x(query + out_x); - const scalar_vec_t q_y(query + out_y); + vec_op::FP32Vec8 fp32_cos(cos); + vec_op::FP32Vec8 fp32_sin(sin); - vec_op::FP32Vec8 fp32_cos(cos); - vec_op::FP32Vec8 fp32_sin(sin); + vec_op::FP32Vec8 fp32_q_x(q_x); + vec_op::FP32Vec8 fp32_q_y(q_y); - vec_op::FP32Vec8 fp32_q_x(q_x); - vec_op::FP32Vec8 fp32_q_y(q_y); + auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; + scalar_vec_t(out1).save(qk + out_x); - auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; - scalar_vec_t(out1).save(query + out_x); - - auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; - scalar_vec_t(out2).save(query + out_y); - } + auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; + scalar_vec_t(out2).save(qk + out_y); } - - for (int i = 0; i < num_kv_heads; ++i) { - const int head_idx = i; - const int64_t token_head = token_idx * key_stride + head_idx * head_size; - for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) { - const int rot_offset = j; - const int x_index = rot_offset; - const int y_index = embed_dim + rot_offset; + if (!flag) { + for (; j < embed_dim; ++j) { + const int x_index = j; + const int y_index = embed_dim + j; const int64_t out_x = token_head + x_index; const int64_t out_y = token_head + y_index; - const scalar_vec_t cos(cache_ptr + x_index); - const scalar_vec_t sin(cache_ptr + y_index); + const float fp32_cos = cache_ptr[x_index]; + const float fp32_sin = cache_ptr[y_index]; - const scalar_vec_t k_x(key + out_x); - const scalar_vec_t k_y(key + out_y); + const float fp32_q_x = qk[out_x]; + const float fp32_q_y = qk[out_y]; - vec_op::FP32Vec8 fp32_cos(cos); - vec_op::FP32Vec8 fp32_sin(sin); + qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; + qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; + } + } + }; - vec_op::FP32Vec8 fp32_k_x(k_x); - vec_op::FP32Vec8 fp32_k_y(k_y); +#pragma omp parallel for + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; - auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin; - scalar_vec_t(out1).save(key + out_x); - auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin; - scalar_vec_t(out2).save(key + out_y); - } + for (int i = 0; i < num_heads; ++i) { + const int head_idx = i; + const int64_t token_head = + token_idx * query_stride + head_idx * head_size; + compute_loop(token_head, cache_ptr, query); + } + + for (int i = 0; i < num_kv_heads; ++i) { + const int head_idx = i; + const int64_t token_head = token_idx * key_stride + head_idx * head_size; + compute_loop(token_head, cache_ptr, key); } } } diff --git a/tests/conftest.py b/tests/conftest.py index e749338e1095a..764374a779d9e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,7 @@ from vllm.multimodal import MultiModalData from vllm.multimodal.image import ImageFeatureData, ImagePixelData from vllm.sequence import SampleLogprobs +from vllm.utils import is_cpu logger = init_logger(__name__) @@ -58,7 +59,8 @@ def cleanup(): with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - torch.cuda.empty_cache() + if not is_cpu(): + torch.cuda.empty_cache() @pytest.fixture() @@ -151,6 +153,12 @@ def example_long_prompts() -> List[str]: class HfRunner: + def wrap_device(self, input: any): + if not is_cpu(): + return input.to("cuda") + else: + return input.to("cpu") + def __init__( self, model_name: str, @@ -164,16 +172,18 @@ def __init__( if model_name in _EMBEDDING_MODELS: # Lazy init required for AMD CI from sentence_transformers import SentenceTransformer - self.model = SentenceTransformer( - model_name, - device="cpu", - ).to(dtype=torch_dtype).cuda() + self.model = self.wrap_device( + SentenceTransformer( + model_name, + device="cpu", + ).to(dtype=torch_dtype)) else: - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - ).cuda() + self.model = self.wrap_device( + AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + )) self.tokenizer = AutoTokenizer.from_pretrained( model_name, @@ -214,7 +224,7 @@ def generate( inputs = self.processor(**processor_kwargs) output_ids = self.model.generate( - **inputs.to("cuda"), + **self.wrap_device(inputs), use_cache=True, **kwargs, ) @@ -271,7 +281,7 @@ def generate_greedy_logprobs( for prompt in prompts: input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids output = self.model.generate( - input_ids.cuda(), + self.wrap_device(input_ids), use_cache=True, do_sample=False, max_new_tokens=max_tokens, @@ -306,7 +316,7 @@ def generate_greedy_logprobs_limit( for prompt in prompts: input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids output = self.model.generate( - input_ids.cuda(), + self.wrap_device(input_ids), use_cache=True, do_sample=False, max_new_tokens=max_tokens, diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index a7abc011f57d7..85d74f7f5b03d 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -8,10 +8,13 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -aqlm_not_supported = (capability < - QUANTIZATION_METHODS["aqlm"].get_min_capability()) +aqlm_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + aqlm_not_supported = (capability < + QUANTIZATION_METHODS["aqlm"].get_min_capability()) # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 10e7c64e34e75..ea95e6a49f03a 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -5,6 +5,7 @@ Run `pytest tests/models/test_big_models.py`. """ import pytest +import torch MODELS = [ "meta-llama/Llama-2-7b-hf", @@ -16,9 +17,14 @@ # "Qwen/Qwen1.5-0.5B" # Broken, ] +#TODO: remove this after CPU float16 support ready +target_dtype = "float" +if torch.cuda.is_available(): + target_dtype = "half" + @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [32]) def test_models( hf_runner, @@ -46,7 +52,7 @@ def test_models( @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", [target_dtype]) def test_model_print( vllm_runner, model: str, diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 0a5819ea3f054..61aee0d0a6e93 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -67,10 +67,13 @@ }, } -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -fp8_not_supported = (capability < - QUANTIZATION_METHODS["fp8"].get_min_capability()) +fp8_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + fp8_not_supported = (capability < + QUANTIZATION_METHODS["fp8"].get_min_capability()) @pytest.mark.skipif(fp8_not_supported, diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 1fc0b3f239127..814471b47763d 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -22,10 +22,13 @@ MAX_MODEL_LEN = 1024 -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -gptq_marlin_not_supported = ( - capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) +gptq_marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + gptq_marlin_not_supported = ( + capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) MODELS = [ # act_order==False, group_size=channelwise diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 3e6ffb7f90fcc..cc35ee803ff01 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -14,10 +14,13 @@ from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -marlin_not_supported = (capability < - QUANTIZATION_METHODS["marlin"].get_min_capability()) +marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + marlin_not_supported = ( + capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) @dataclass diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 37c1664afec55..8520b26718bf5 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -19,10 +19,13 @@ from .utils import check_logprobs_close -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -marlin_not_supported = (capability < - QUANTIZATION_METHODS["marlin"].get_min_capability()) +marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + marlin_not_supported = ( + capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) @dataclass