[core] improve error handling when wake up from sleep mode (#12981)
Signed-off-by: youkaichao <youkaichao@gmail.com>
youkaichao authored Feb 10, 2025
1 parent 29f1d47 commit 59fff4a
Showing 2 changed files with 78 additions and 12 deletions.
63 changes: 51 additions & 12 deletions csrc/cumem_allocator.cpp
@@ -12,15 +12,21 @@ extern "C" {
#include <cuda_runtime_api.h>
#include <cuda.h>

#define CUDA_CHECK(condition) \
  do { \
    CUresult error = condition; \
    if (error != 0) { \
      char* error_string; \
      cuGetErrorString(error, (const char**)&error_string); \
      std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \
                << __LINE__ << std::endl; \
    } \
char error_msg[10240]; // 10KB buffer to store error messages
CUresult no_error = CUresult(0);
CUresult error_code = no_error; // store error code

#define CUDA_CHECK(condition) \
  do { \
    CUresult error = condition; \
    if (error != 0) { \
      error_code = error; \
      char* error_string; \
      cuGetErrorString(error, (const char**)&error_string); \
      snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \
               error_string, __FILE__, __LINE__); \
      std::cerr << error_msg << std::endl; \
    } \
  } while (0)

// Global references to Python callables
@@ -54,14 +60,22 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,

  // Allocate memory using cuMemCreate
  CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
  if (error_code != 0) {
    return;
  }
  CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0));

  if (error_code != 0) {
    return;
  }
  CUmemAccessDesc accessDesc = {};
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = device;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

  CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1));
  if (error_code != 0) {
    return;
  }
  // std::cout << "create_and_map: device=" << device << ", size=" << size << ",
  // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
}
@@ -73,7 +87,13 @@ void unmap_and_release(unsigned long long device, ssize_t size,
// ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
ensure_context(device);
CUDA_CHECK(cuMemUnmap(d_mem, size));
if (error_code != 0) {
return;
}
CUDA_CHECK(cuMemRelease(*p_memHandle));
if (error_code != 0) {
return;
}
}

PyObject* create_tuple_from_c_integers(unsigned long long a,
@@ -121,12 +141,16 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
  size_t granularity;
  CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
                                           CU_MEM_ALLOC_GRANULARITY_MINIMUM));

  if (error_code != 0) {
    return nullptr;
  }
  size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;

  CUdeviceptr d_mem;
  CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));

  if (error_code != 0) {
    return nullptr;
  }
  // allocate the CUmemGenericAllocationHandle
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)malloc(
@@ -208,6 +232,9 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {

  // free address and the handle
  CUDA_CHECK(cuMemAddressFree(d_mem, size));
  if (error_code != 0) {
    return;
  }
  free(p_memHandle);
}

@@ -258,6 +285,12 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {

  unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);

  if (error_code != 0) {
    error_code = no_error;
    PyErr_SetString(PyExc_RuntimeError, error_msg);
    return nullptr;
  }

  Py_RETURN_NONE;
}

@@ -282,6 +315,12 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {

  create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);

  if (error_code != 0) {
    error_code = no_error;
    PyErr_SetString(PyExc_RuntimeError, error_msg);
    return nullptr;
  }

  Py_RETURN_NONE;
}

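To make the new control flow easier to follow outside the diff view, here is a minimal, self-contained sketch of the pattern this commit introduces. It is condensed from the diff above, not a drop-in excerpt; the names ending in _sketch and their argument lists are illustrative only. CUDA_CHECK records the failing CUresult and a formatted message in file-level globals, the low-level helpers return early once error_code is set, and the Python-facing wrappers reset the flag and raise a RuntimeError carrying the recorded message.

// Condensed sketch only; names ending in _sketch are illustrative.
#include <Python.h>
#include <cuda.h>

#include <cstdio>
#include <iostream>

static char error_msg[10240];              // last formatted error message
static CUresult error_code = CUresult(0);  // last error code, 0 == success

#define CUDA_CHECK(condition) \
  do { \
    CUresult error = condition; \
    if (error != 0) { \
      error_code = error; \
      const char* error_string = nullptr; \
      cuGetErrorString(error, &error_string); \
      snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \
               error_string, __FILE__, __LINE__); \
      std::cerr << error_msg << std::endl; \
    } \
  } while (0)

// Low-level helper: bail out at the first failure so a later driver call
// does not overwrite the error that actually caused the problem.
static void create_and_map_sketch(CUdeviceptr d_mem, size_t size,
                                  CUmemGenericAllocationHandle* handle,
                                  const CUmemAllocationProp* prop) {
  CUDA_CHECK(cuMemCreate(handle, size, prop, 0));
  if (error_code != 0) return;
  CUDA_CHECK(cuMemMap(d_mem, size, 0, *handle, 0));
}

// Python-facing wrapper: translate the recorded error into an exception.
static PyObject* python_wrapper_sketch(PyObject* self, PyObject* args) {
  // ... parse args and call create_and_map_sketch(...) here ...
  if (error_code != 0) {
    error_code = CUresult(0);  // reset so the next call starts clean
    PyErr_SetString(PyExc_RuntimeError, error_msg);
    return nullptr;
  }
  Py_RETURN_NONE;
}

One consequence of this design, visible in the diff, is that the error state lives in globals shared by all calls, so the wrappers must reset error_code before raising the exception.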
27 changes: 27 additions & 0 deletions tests/basic_correctness/test_cumem.py
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0

import pytest
import torch

from vllm import LLM, SamplingParams
@@ -9,6 +10,32 @@
from ..utils import fork_new_process_for_each_test


@fork_new_process_for_each_test
def test_python_error():
    """
    Test if Python error occurs when there's low-level
    error happening from the C++ side.
    """
    allocator = CuMemAllocator.get_instance()
    total_bytes = torch.cuda.mem_get_info()[1]
    alloc_bytes = int(total_bytes * 0.7)
    tensors = []
    with allocator.use_memory_pool():
        # allocate 70% of the total memory
        x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
        tensors.append(x)
    # release the memory
    allocator.sleep()

    # allocate more memory than the total memory
    y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
    tensors.append(y)
    with pytest.raises(RuntimeError):
        # when the allocator is woken up, it should raise an error
        # because we don't have enough memory
        allocator.wake_up()


@fork_new_process_for_each_test
def test_basic_cumem():
    # some tensors from default memory pool