From c214e6d00e9114fc3165dabd899c9aa3c61a3207 Mon Sep 17 00:00:00 2001 From: "Akio.Hayakawa" Date: Tue, 25 Jun 2019 08:48:12 +0900 Subject: [PATCH] add stream_event_handler API --- include/nbla/cuda/init.hpp | 21 ++++++ python/src/nnabla_ext/cuda/__init__.py | 3 +- python/src/nnabla_ext/cuda/init.pyx | 66 +++++++++++++++++- src/nbla/cuda/init.cpp.tmpl | 92 ++++++++++++++++++++++++++ 4 files changed, 180 insertions(+), 2 deletions(-) diff --git a/include/nbla/cuda/init.hpp b/include/nbla/cuda/init.hpp index 0b85d8832..c5f50ee7e 100644 --- a/include/nbla/cuda/init.hpp +++ b/include/nbla/cuda/init.hpp @@ -17,6 +17,7 @@ #include +#include #include #include @@ -24,6 +25,7 @@ namespace nbla { using std::vector; using std::string; +using std::shared_ptr; /** Initialize CUDA features. @@ -53,5 +55,24 @@ NBLA_CUDA_API int cuda_get_device_count(); /** get available devices. */ NBLA_CUDA_API vector cuda_get_devices(); + +/** cudaStream wrapper functions. +*/ +NBLA_CUDA_API shared_ptr cuda_create_stream(int device_id = -1); + +NBLA_CUDA_API void *cuda_stream_shared_to_void(shared_ptr stream); +NBLA_CUDA_API void print_stream_flag(shared_ptr stream); +NBLA_CUDA_API void print_stream_priority(shared_ptr stream); +NBLA_CUDA_API void cuda_stream_synchronize(shared_ptr stream); +NBLA_CUDA_API void cuda_nullstream_synchronize(); +NBLA_CUDA_API void cuda_stream_destroy(shared_ptr stream); + +/** cudaEvent wrapper functions. +*/ +NBLA_CUDA_API shared_ptr cuda_create_event(int device_id = -1); +NBLA_CUDA_API void cuda_default_stream_event(shared_ptr event); +NBLA_CUDA_API void cuda_stream_wait_event(shared_ptr stream, + shared_ptr event); +NBLA_CUDA_API void cuda_event_synchronize(shared_ptr event); } #endif diff --git a/python/src/nnabla_ext/cuda/__init__.py b/python/src/nnabla_ext/cuda/__init__.py index 2560db6a2..9c9c917a8 100644 --- a/python/src/nnabla_ext/cuda/__init__.py +++ b/python/src/nnabla_ext/cuda/__init__.py @@ -30,7 +30,8 @@ array_classes, device_synchronize, get_device_count, - get_devices) + get_devices, + StreamEventHandler) except: print('Please install CUDA version {}.'.format(__cuda_version__)) print(' and CUDNN version {}.'.format(__cudnn_version__)) diff --git a/python/src/nnabla_ext/cuda/init.pyx b/python/src/nnabla_ext/cuda/init.pyx index 407a1a6a6..a50999791 100644 --- a/python/src/nnabla_ext/cuda/init.pyx +++ b/python/src/nnabla_ext/cuda/init.pyx @@ -18,6 +18,9 @@ from nnabla import add_available_context import nnabla._init as cpu_init from libcpp.vector cimport vector from libcpp.string cimport string +from libcpp.memory cimport shared_ptr +from libc.stdint cimport uintptr_t +from libcpp cimport bool cdef extern from "nbla/cuda/init.hpp" namespace "nbla": void init_cuda() except+ @@ -27,7 +30,17 @@ cdef extern from "nbla/cuda/init.hpp" namespace "nbla": void cuda_device_synchronize(const string & device) except + int cuda_get_device_count() except + vector[string] cuda_get_devices() except + - + shared_ptr[void] cuda_create_stream(int device_id) except + + void* cuda_stream_shared_to_void(shared_ptr[void]) except + + void print_stream_flag(shared_ptr[void]) except + + void print_stream_priority(shared_ptr[void]) except + + void cuda_stream_synchronize(shared_ptr[void]) nogil except + + void cuda_nullstream_synchronize() nogil except + + void cuda_stream_destroy(shared_ptr[void]) except + + shared_ptr[void] cuda_create_event(int device_id) except + + void cuda_default_stream_event(shared_ptr[void]) except + + void cuda_stream_wait_event(shared_ptr[void], shared_ptr[void]) except + + void cuda_event_synchronize(shared_ptr[void]) nogil except + logger.info('Initializing CUDA extension...') try: @@ -110,3 +123,54 @@ def get_devices(): """ return cuda_get_devices() ############################################################################### + +cdef class StreamEventHandler: + cdef shared_ptr[void] stream + cdef shared_ptr[void] event + cdef public object value + cdef public int device_id + cpdef bool is_stream_destroy + + def __cinit__(self, int device_id=-1): + self.is_stream_destroy = True + self.device_id = device_id + + def __init__(self, int device_id=-1): + self.stream_create(device_id) + self.event = cuda_create_event(device_id) + self.add_default_stream_event() + + def stream_wait_event(self): + if not self.is_stream_destroy: + cuda_stream_wait_event(self.stream, self.event) + + def add_default_stream_event(self): + cuda_default_stream_event(self.event) + + def event_synchronize(self): + with nogil: + cuda_event_synchronize(self.event) + + def stream_destroy(self): + cuda_stream_destroy(self.stream) + self.is_stream_destroy = True + + def stream_create(self, device_id): + if not self.is_stream_destroy: + self.stream_destroy() + + self.stream = cuda_create_stream(device_id) + + cdef void* stream_vp = cuda_stream_shared_to_void(self.stream) + self.value = stream_vp + + self.is_stream_destroy = False + + def stream_synchronize(self): + if not self.is_stream_destroy: + with nogil: + cuda_stream_synchronize(self.stream) + + def default_stream_synchronize(self): + with nogil: + cuda_nullstream_synchronize() diff --git a/src/nbla/cuda/init.cpp.tmpl b/src/nbla/cuda/init.cpp.tmpl index e44f76b65..dea1a09e7 100644 --- a/src/nbla/cuda/init.cpp.tmpl +++ b/src/nbla/cuda/init.cpp.tmpl @@ -168,5 +168,97 @@ vector cuda_get_devices() { } return ret; } + +shared_ptr cuda_create_stream(int device_id) { + cuda_set_device(device_id); + + std::default_delete default_deleter; + auto deleter = [default_deleter](cudaStream_t* ptr) { + NBLA_CUDA_CHECK(cudaStreamDestroy(*ptr)); + + default_deleter(ptr); + }; + + auto stream = shared_ptr(new cudaStream_t(), deleter); + + NBLA_CUDA_CHECK(cudaStreamCreateWithFlags(stream.get(), cudaStreamNonBlocking)); + + return stream; +} + +void* cuda_stream_shared_to_void(shared_ptr stream) { + auto s = static_cast(stream.get()); + + return static_cast(*s); +} + +void print_stream_flag (shared_ptr stream) { + auto s = static_cast(stream.get()); + unsigned int flags; + + NBLA_CUDA_CHECK(cudaStreamGetFlags(*s, &flags)); + printf("flags: %u\n", flags); +} + +void print_stream_priority (shared_ptr stream) { + auto s = static_cast(stream.get()); + int p; + + NBLA_CUDA_CHECK(cudaStreamGetPriority(*s, &p)); + printf("priority: %d\n", p); +} + +void cuda_nullstream_synchronize() { + NBLA_CUDA_CHECK(cudaStreamSynchronize(0)); +} + +void cuda_stream_synchronize(shared_ptr stream) { + auto s = static_cast(stream.get()); + NBLA_CUDA_CHECK(cudaStreamSynchronize(*s)); +} + +void cuda_stream_destroy(shared_ptr stream) { + auto s = static_cast(stream.get()); + + NBLA_CUDA_CHECK(cudaStreamDestroy(*s)); +} + +std::shared_ptr cuda_create_event(int device_id) { + cuda_set_device(device_id); + + std::default_delete default_deleter; + auto deleter = [default_deleter](cudaEvent_t* ptr) { + NBLA_CUDA_CHECK(cudaEventDestroy(*ptr)); + + default_deleter(ptr); + }; + + auto event = shared_ptr(new cudaEvent_t(), deleter); + + NBLA_CUDA_CHECK(cudaEventCreateWithFlags(event.get(), cudaEventDisableTiming)); + + return event; +} + +void cuda_default_stream_event(shared_ptr event){ + auto e = static_cast(event.get()); + + NBLA_CUDA_CHECK(cudaEventRecord(*e)); + +} +void cuda_stream_wait_event(shared_ptr stream, shared_ptr event) { + auto s = static_cast(stream.get()); + auto e = static_cast(event.get()); + + NBLA_CUDA_CHECK(cudaStreamWaitEvent(*s, *e, 0)); + +} + +void cuda_event_synchronize(shared_ptr event) { + auto e = static_cast(event.get()); + + NBLA_CUDA_CHECK(cudaEventSynchronize(*e)); +} + }