
[Refactor] A simple device-related refactor #11163

Merged: 1 commit, Dec 13, 2024

5 changes: 5 additions & 0 deletions vllm/platforms/cpu.py
@@ -98,3 +98,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                     "vllm.worker.cpu_worker.CPUWorker"
             else:
                 parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on CPU.")
+        return False

9 changes: 9 additions & 0 deletions vllm/platforms/hpu.py
@@ -2,13 +2,17 @@

 import torch

+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum, _Backend

 if TYPE_CHECKING:
     from vllm.config import VllmConfig
 else:
     VllmConfig = None

+logger = init_logger(__name__)
+

 class HpuPlatform(Platform):
     _enum = PlatformEnum.HPU
@@ -43,3 +47,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         parallel_config = vllm_config.parallel_config
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on HPU.")
+        return False

17 changes: 17 additions & 0 deletions vllm/platforms/interface.py
@@ -1,6 +1,7 @@
 import enum
 import platform
 import random
+from platform import uname
 from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union

 import numpy as np
@@ -16,6 +17,11 @@
 logger = init_logger(__name__)


+def in_wsl() -> bool:
+    # Reference: https://github.com/microsoft/WSL/issues/4071
+    return "microsoft" in " ".join(uname()).lower()
+
+
 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
     FLASH_ATTN_VLLM_V1 = enum.auto()
@@ -221,6 +227,17 @@ def get_cpu_architecture(cls) -> CpuArchEnum:

         return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN

+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        """Checks whether pin memory is available on the current platform."""
+        if in_wsl():
+            # Pinning memory in WSL is not supported.
+            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+            logger.warning("Using 'pin_memory=False' as WSL is detected. "
+                           "This may slow down the performance.")
+            return False
+        return True
+

 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED

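The hunks above establish the pattern the rest of this PR applies: the generic WSL detection lives once in the Platform base class, and devices that cannot pin memory override the classmethod to return False. A minimal, runnable sketch of this template-method shape (the class and platform names here are simplified stand-ins, not the actual vLLM hierarchy):

import logging

logger = logging.getLogger(__name__)


class BasePlatform:
    @classmethod
    def is_pin_memory_available(cls) -> bool:
        # Default: assume pinning works; subclasses opt out explicitly.
        return True


class NoPinPlatform(BasePlatform):
    @classmethod
    def is_pin_memory_available(cls) -> bool:
        # Device-specific override: warn and disable pinning.
        logger.warning("Pin memory is not supported on this platform.")
        return False


current_platform = NoPinPlatform
print(current_platform.is_pin_memory_available())  # False, via the override
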
9 changes: 9 additions & 0 deletions vllm/platforms/neuron.py
@@ -1,12 +1,16 @@
 from typing import TYPE_CHECKING, Optional

+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum

 if TYPE_CHECKING:
     from vllm.config import VllmConfig
 else:
     VllmConfig = None

+logger = init_logger(__name__)
+

 class NeuronPlatform(Platform):
     _enum = PlatformEnum.NEURON
@@ -28,3 +32,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = \
                 "vllm.worker.neuron_worker.NeuronWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on Neuron.")
+        return False

10 changes: 5 additions & 5 deletions vllm/platforms/openvino.py
@@ -34,27 +34,27 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
         return _Backend.OPENVINO

     @classmethod
-    def get_device_name(self, device_id: int = 0) -> str:
+    def get_device_name(cls, device_id: int = 0) -> str:
         return "openvino"

     @classmethod
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return False

     @classmethod
-    def inference_mode(self):
+    def inference_mode(cls):
         return torch.inference_mode(mode=True)

     @classmethod
-    def is_openvino_cpu(self) -> bool:
+    def is_openvino_cpu(cls) -> bool:
         return "CPU" in envs.VLLM_OPENVINO_DEVICE

     @classmethod
-    def is_openvino_gpu(self) -> bool:
+    def is_openvino_gpu(cls) -> bool:
         return "GPU" in envs.VLLM_OPENVINO_DEVICE

     @classmethod
-    def is_pin_memory_available(self) -> bool:
+    def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on OpenViNO.")
         return False

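Worth noting: the openvino.py edits are behavior-preserving. These methods already carried @classmethod, so their first parameter is bound to the class no matter what it is named; renaming self to cls only restores the conventional spelling. A tiny illustration (the Demo class is hypothetical, not from the diff):

class Demo:
    @classmethod
    def misnamed(self):  # legal but misleading: 'self' here is the class
        return self.__name__

    @classmethod
    def conventional(cls):  # identical behavior, conventional name
        return cls.__name__


assert Demo.misnamed() == Demo.conventional() == "Demo"
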
5 changes: 5 additions & 0 deletions vllm/platforms/xpu.py
@@ -78,3 +78,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.distributed_executor_backend = "ray"
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on XPU.")
+        return False

27 changes: 1 addition & 26 deletions vllm/utils.py
@@ -24,7 +24,6 @@
 from collections import UserDict, defaultdict
 from collections.abc import Iterable, Mapping
 from functools import lru_cache, partial, wraps
-from platform import uname
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                     Dict, Generic, Hashable, List, Literal, Optional,
                     OrderedDict, Set, Tuple, Type, TypeVar, Union, overload)
@@ -344,12 +343,6 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)


-@lru_cache(maxsize=None)
-def in_wsl() -> bool:
-    # Reference: https://github.com/microsoft/WSL/issues/4071
-    return "microsoft" in " ".join(uname()).lower()
-
-
 def make_async(
     func: Callable[P, T],
     executor: Optional[concurrent.futures.Executor] = None
@@ -729,25 +722,7 @@ def print_warning_once(msg: str) -> None:

 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
-
-    if in_wsl():
-        # Pinning memory in WSL is not supported.
-        # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
-        print_warning_once("Using 'pin_memory=False' as WSL is detected. "
-                           "This may slow down the performance.")
-        return False
-    elif current_platform.is_xpu():
-        print_warning_once("Pin memory is not supported on XPU.")
-        return False
-    elif current_platform.is_neuron():
-        print_warning_once("Pin memory is not supported on Neuron.")
-        return False
-    elif current_platform.is_hpu():
-        print_warning_once("Pin memory is not supported on HPU.")
-        return False
-    elif current_platform.is_cpu() or current_platform.is_openvino():
-        return False
-    return True
+    return current_platform.is_pin_memory_available()


 class DeviceMemoryProfiler:

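After this refactor, vllm.utils.is_pin_memory_available is a one-line, lru_cache-memoized facade over the platform object, so supporting a new device means overriding one classmethod instead of extending an if/elif chain. A hedged sketch of the resulting call path (the tensor-allocation line is illustrative, not code from this PR):

from vllm.utils import is_pin_memory_available

# Dispatches to current_platform.is_pin_memory_available(); any
# platform-specific warning fires on the first call only, because
# @lru_cache memoizes the boolean for the rest of the process.
pin_memory = is_pin_memory_available()

# Typical consumer: pass the flag through to tensor allocation, e.g.
# cpu_cache = torch.empty(size, dtype=dtype, pin_memory=pin_memory)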