diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index edabb864c3d..ce8b0f77484 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -6,6 +6,7 @@ * All rights reserved. * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -106,6 +107,14 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_buffer_id }; +static inline int opal_accelerator_cuda_delayed_init_check(void) /* returns OPAL_SUCCESS without calling delayed init once init has completed */ +{ + if (OPAL_UNLIKELY(true != mca_accelerator_cuda_init_complete)) { + return opal_accelerator_cuda_delayed_init(); + } + return OPAL_SUCCESS; +} + static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags) { CUresult result; @@ -236,15 +245,15 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } } /* First access on a device pointer finalizes CUDA support initialization. 
*/ - opal_accelerator_cuda_delayed_init(); + (void)opal_accelerator_cuda_delayed_init_check(); /* return value deliberately discarded at this call site */ return 1; } static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream) { CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } *stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_cuda_stream_t); @@ -293,8 +302,8 @@ OBJ_CLASS_INSTANCE( static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **event, bool enable_ipc) { CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } @@ -396,8 +405,8 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * { CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } @@ -423,8 +432,8 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, { CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } @@ -464,8 +473,8 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, CUdeviceptr tmp; CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS 
!= delayed_init)) { return delayed_init; } @@ -503,8 +512,8 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) { CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } @@ -542,8 +551,8 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void { CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } @@ -566,25 +575,80 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void static bool accelerator_cuda_is_ipc_enabled(void) { - return false; + return true; +} + +static void mca_accelerator_cuda_ipc_handle_destruct(opal_accelerator_cuda_ipc_handle_t *handle) /* closes base.dev_ptr with cuIpcCloseMemHandle when it is non-NULL, then clears it */ +{ + if (NULL != handle && NULL != handle->base.dev_ptr) { + cuIpcCloseMemHandle((CUdeviceptr) handle->base.dev_ptr); /* NOTE(review): the CUresult is ignored here - confirm that is intended */ + handle->base.dev_ptr = NULL; + } } +OBJ_CLASS_INSTANCE( + opal_accelerator_cuda_ipc_handle_t, + opal_accelerator_ipc_handle_t, + NULL, + mca_accelerator_cuda_ipc_handle_destruct); + static int accelerator_cuda_get_ipc_handle(int dev_id, void *dev_ptr, opal_accelerator_ipc_handle_t *handle) { - return OPAL_ERR_NOT_IMPLEMENTED; + if (NULL == dev_ptr || NULL == handle) { + return OPAL_ERR_BAD_PARAM; + } + + CUipcMemHandle cuda_ipc_handle; + opal_accelerator_cuda_ipc_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_handle_t *) handle; + + OBJ_CONSTRUCT(cuda_handle, opal_accelerator_cuda_ipc_handle_t); + cuda_handle->base.dev_ptr = NULL; /* a NULL dev_ptr makes the destructor skip cuIpcCloseMemHandle */ + + CUresult err = cuIpcGetMemHandle(&cuda_ipc_handle, + (CUdeviceptr)dev_ptr); + if (OPAL_UNLIKELY(CUDA_SUCCESS != err)) { + opal_output_verbose(10, 
opal_accelerator_base_framework.framework_output, + "Error in cuIpcGetMemHandle dev_ptr %p", dev_ptr); + OBJ_DESTRUCT(cuda_handle); + return OPAL_ERROR; + } + memcpy(cuda_handle->base.handle, &cuda_ipc_handle, IPC_MAX_HANDLE_SIZE); + + return OPAL_SUCCESS; } static int accelerator_cuda_import_ipc_handle(int dev_id, uint8_t ipc_handle[IPC_MAX_HANDLE_SIZE], opal_accelerator_ipc_handle_t *handle) { - return OPAL_ERR_NOT_IMPLEMENTED; + opal_accelerator_cuda_ipc_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_handle_t *) handle; + OBJ_CONSTRUCT(cuda_handle, opal_accelerator_cuda_ipc_handle_t); + memcpy(cuda_handle->base.handle, ipc_handle, IPC_MAX_HANDLE_SIZE); + + return OPAL_SUCCESS; } static int accelerator_cuda_open_ipc_handle(int dev_id, opal_accelerator_ipc_handle_t *handle, void **dev_ptr) { - return OPAL_ERR_NOT_IMPLEMENTED; + if (NULL == dev_ptr || NULL == handle) { + return OPAL_ERR_BAD_PARAM; + } + + CUresult err = cuIpcOpenMemHandle((CUdeviceptr *) &handle->dev_ptr, + *(CUipcMemHandle*)handle->handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + if (CUDA_ERROR_ALREADY_MAPPED == err) { + return OPAL_ERR_WOULD_BLOCK; + } + else if (CUDA_SUCCESS != err) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error in cuIpcOpenMemHandle"); + return OPAL_ERROR; + } + *dev_ptr = handle->dev_ptr; + + return OPAL_SUCCESS; } static int accelerator_cuda_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HANDLE_SIZE], @@ -593,29 +657,84 @@ static int accelerator_cuda_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HANDLE_ return memcmp(handle_1, handle_2, IPC_MAX_HANDLE_SIZE); } +static void mca_accelerator_cuda_ipc_event_handle_destruct(opal_accelerator_cuda_ipc_event_handle_t *handle) +{ + // Just a place holder, there is no cuIpcCloseEventHandle. 
+} + +OBJ_CLASS_INSTANCE( + opal_accelerator_cuda_ipc_event_handle_t, + opal_accelerator_ipc_event_handle_t, + NULL, + mca_accelerator_cuda_ipc_event_handle_destruct); + static int accelerator_cuda_get_ipc_event_handle(opal_accelerator_event_t *event, opal_accelerator_ipc_event_handle_t *handle) { - return OPAL_ERR_NOT_IMPLEMENTED; + if (NULL == event || NULL == handle) { + return OPAL_ERR_BAD_PARAM; + } + + CUipcEventHandle cuda_ipc_handle; + opal_accelerator_cuda_ipc_event_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t *) handle; + OBJ_CONSTRUCT(cuda_handle, opal_accelerator_cuda_ipc_event_handle_t); + + memset(cuda_ipc_handle.reserved, 0, CU_IPC_HANDLE_SIZE); + CUresult err = cuIpcGetEventHandle(&cuda_ipc_handle, + *((CUevent *)event->event)); + if (OPAL_UNLIKELY(CUDA_SUCCESS != err)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error in cuIpcGetEventHandle"); + OBJ_DESTRUCT(cuda_handle); + return OPAL_ERROR; + } + memcpy(cuda_handle->base.handle, &cuda_ipc_handle, IPC_MAX_HANDLE_SIZE); + + return OPAL_SUCCESS; } static int accelerator_cuda_import_ipc_event_handle(uint8_t ipc_handle[IPC_MAX_HANDLE_SIZE], opal_accelerator_ipc_event_handle_t *handle) { - return OPAL_ERR_NOT_IMPLEMENTED; + opal_accelerator_cuda_ipc_event_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t *) handle; /* event-handle class, so destruct never runs the memory-handle destructor */ + + OBJ_CONSTRUCT(cuda_handle, opal_accelerator_cuda_ipc_event_handle_t); + memcpy(cuda_handle->base.handle, ipc_handle, IPC_MAX_HANDLE_SIZE); + + return OPAL_SUCCESS; } static int accelerator_cuda_open_ipc_event_handle(opal_accelerator_ipc_event_handle_t *handle, opal_accelerator_event_t *event) { - return OPAL_ERR_NOT_IMPLEMENTED; + if (NULL == event || NULL == handle) { + return OPAL_ERR_BAD_PARAM; + } + + opal_accelerator_cuda_ipc_event_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t *) handle; + opal_accelerator_cuda_event_t *cuda_event = (opal_accelerator_cuda_event_t *) event; + OBJ_CONSTRUCT(cuda_event, 
opal_accelerator_cuda_event_t); + cuda_event->base.event = malloc(sizeof(CUevent)); + if (NULL == cuda_event->base.event) { + return OPAL_ERR_OUT_OF_RESOURCE; /* NOTE(review): cuda_event was OBJ_CONSTRUCTed above and is not destructed on this path - confirm the caller handles it */ + } + + CUresult err = cuIpcOpenEventHandle( (CUevent *)cuda_event->base.event, + *((CUipcEventHandle*)cuda_handle->base.handle)); + if (OPAL_UNLIKELY(CUDA_SUCCESS != err)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error in cuIpcOpenEventHandle"); + return OPAL_ERROR; /* NOTE(review): the CUevent buffer malloc'd above is not freed on this path - confirm the caller releases it */ + } + + return OPAL_SUCCESS; } static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size) { CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } @@ -652,8 +771,8 @@ static int accelerator_cuda_get_device(int *dev_id) CUdevice cuDev; CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } @@ -714,8 +833,8 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de { CUresult result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } @@ -744,8 +863,8 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc CUresult result; int enable = 1; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { + int delayed_init = opal_accelerator_cuda_delayed_init_check(); + if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) { return delayed_init; } diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h 
b/opal/mca/accelerator/cuda/accelerator_cuda.h index 694a4192231..8d3529ce5ff 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -2,6 +2,7 @@ * Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -37,15 +38,28 @@ struct opal_accelerator_cuda_event_t { typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); +struct opal_accelerator_cuda_ipc_handle_t { + opal_accelerator_ipc_handle_t base; +}; +typedef struct opal_accelerator_cuda_ipc_handle_t opal_accelerator_cuda_ipc_handle_t; +OBJ_CLASS_DECLARATION(opal_accelerator_cuda_ipc_handle_t); + +struct opal_accelerator_cuda_ipc_event_handle_t { + opal_accelerator_ipc_event_handle_t base; +}; +typedef struct opal_accelerator_cuda_ipc_event_handle_t opal_accelerator_cuda_ipc_event_handle_t; +OBJ_CLASS_DECLARATION(opal_accelerator_cuda_ipc_event_handle_t); + /* Declare extern variables, defined in accelerator_cuda_component.c */ -OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; -OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock; +extern CUstream opal_accelerator_cuda_memcpy_stream; +extern opal_mutex_t opal_accelerator_cuda_stream_lock; +extern bool mca_accelerator_cuda_init_complete; /* set to true once opal_accelerator_cuda_delayed_init() has succeeded */ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_component; -OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module; +extern opal_accelerator_base_module_t opal_accelerator_cuda_module; -OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void); +extern int opal_accelerator_cuda_delayed_init(void); END_C_DECLS diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c 
b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index d48e29c9f65..b7baffd6aec 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -6,6 +6,7 @@ * reserved. * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -39,7 +40,7 @@ opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; /* Initialization lock for delayed cuda initialization */ static opal_mutex_t accelerator_cuda_init_lock; -static bool accelerator_cuda_init_complete = false; +bool mca_accelerator_cuda_init_complete = false; /* no longer static: accelerator_cuda.c tests it before calling delayed init */ #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) @@ -127,13 +128,13 @@ int opal_accelerator_cuda_delayed_init() /* Double checked locking to avoid having to * grab locks post lazy-initialization. */ opal_atomic_rmb(); - if (true == accelerator_cuda_init_complete) { + if (true == mca_accelerator_cuda_init_complete) { return OPAL_SUCCESS; } OPAL_THREAD_LOCK(&accelerator_cuda_init_lock); /* If already initialized, just exit */ - if (true == accelerator_cuda_init_complete) { + if (true == mca_accelerator_cuda_init_complete) { goto out; } @@ -141,6 +142,7 @@ int opal_accelerator_cuda_delayed_init() * so, all is good. If not, then disable registration of memory. 
*/ result = cuCtxGetCurrent(&cuContext); if (CUDA_SUCCESS != result) { + result = OPAL_ERR_NOT_INITIALIZED; /* return an OPAL error code instead of the raw CUresult */ opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed"); goto out; } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) { @@ -172,7 +174,7 @@ int opal_accelerator_cuda_delayed_init() } result = OPAL_SUCCESS; opal_atomic_wmb(); - accelerator_cuda_init_complete = true; + mca_accelerator_cuda_init_complete = true; /* flag is set only after the write barrier above */ out: OPAL_THREAD_UNLOCK(&accelerator_cuda_init_lock); return result; @@ -189,7 +191,7 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) return NULL; } - opal_accelerator_cuda_delayed_init(); + (void)opal_accelerator_cuda_delayed_init(); return &opal_accelerator_cuda_module; } diff --git a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h index 38409778ad4..8eab728b4d2 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm.h +++ b/opal/mca/accelerator/rocm/accelerator_rocm.h @@ -41,7 +41,7 @@ typedef struct { } opal_accelerator_rocm_component_t; OPAL_DECLSPEC extern opal_accelerator_rocm_component_t mca_accelerator_rocm_component; -OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_rocm_module; +extern opal_accelerator_base_module_t opal_accelerator_rocm_module; struct opal_accelerator_rocm_stream_t { opal_accelerator_stream_t base; @@ -67,12 +67,12 @@ struct opal_accelerator_rocm_ipc_event_handle_t { typedef struct opal_accelerator_rocm_ipc_event_handle_t opal_accelerator_rocm_ipc_event_handle_t; OBJ_CLASS_DECLARATION(opal_accelerator_rocm_ipc_event_handle_t); -OPAL_DECLSPEC extern hipStream_t opal_accelerator_rocm_MemcpyStream; -OPAL_DECLSPEC extern int opal_accelerator_rocm_memcpy_async; -OPAL_DECLSPEC extern int opal_accelerator_rocm_verbose; -OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyH2D_limit; -OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyD2H_limit; +extern hipStream_t 
opal_accelerator_rocm_MemcpyStream; +extern int opal_accelerator_rocm_memcpy_async; +extern int opal_accelerator_rocm_verbose; +extern size_t opal_accelerator_rocm_memcpyH2D_limit; +extern size_t opal_accelerator_rocm_memcpyD2H_limit; -OPAL_DECLSPEC extern int opal_accelerator_rocm_lazy_init(void); +extern int opal_accelerator_rocm_lazy_init(void); #endif diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index 8f1bbbb53a5..2f40c0e35f5 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -176,6 +176,7 @@ int opal_accelerator_rocm_lazy_init() err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err) { opal_output(0, "Could not create hipStream, err=%d %s\n", err, hipGetErrorString(err)); + err = OPAL_ERROR; /* set only after the HIP error was logged above; we got hipErrorInvalidValue, pretty bad */ goto out;