From 8ba81e47ae8f5bdc2f68bbd7d6c520b2982651fa Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 10 May 2022 18:38:18 -0400 Subject: [PATCH 1/3] Use CUB's new CDP macros. --- dependencies/cub | 2 +- testing/cmake/check_source_files.cmake | 33 + thrust/system/cuda/config.h | 45 +- .../system/cuda/detail/adjacent_difference.h | 35 +- thrust/system/cuda/detail/cdp_dispatch.h | 72 ++ thrust/system/cuda/detail/copy.h | 45 +- thrust/system/cuda/detail/copy_if.h | 102 +-- .../system/cuda/detail/core/agent_launcher.h | 2 - .../cuda/detail/core/triple_chevron_launch.h | 864 +----------------- thrust/system/cuda/detail/core/util.h | 34 +- thrust/system/cuda/detail/extrema.h | 144 ++- thrust/system/cuda/detail/merge.h | 107 +-- thrust/system/cuda/detail/par_to_seq.h | 6 - thrust/system/cuda/detail/parallel_for.h | 33 +- thrust/system/cuda/detail/partition.h | 298 +++--- thrust/system/cuda/detail/reduce.h | 46 +- thrust/system/cuda/detail/reduce_by_key.h | 69 +- thrust/system/cuda/detail/scan.h | 69 +- thrust/system/cuda/detail/scan_by_key.h | 86 +- thrust/system/cuda/detail/set_operations.h | 463 ++++------ thrust/system/cuda/detail/sort.h | 120 +-- thrust/system/cuda/detail/unique.h | 66 +- thrust/system/cuda/detail/unique_by_key.h | 93 +- thrust/system/cuda/detail/util.h | 12 +- 24 files changed, 941 insertions(+), 1905 deletions(-) create mode 100644 thrust/system/cuda/detail/cdp_dispatch.h diff --git a/dependencies/cub b/dependencies/cub index 29b030482..a634b91cb 160000 --- a/dependencies/cub +++ b/dependencies/cub @@ -1 +1 @@ -Subproject commit 29b0304823b67369dca093b7cb0658892e001780 +Subproject commit a634b91cb964682b26be660af6a515aa8955f85d diff --git a/testing/cmake/check_source_files.cmake b/testing/cmake/check_source_files.cmake index 866f5e7db..900300c67 100644 --- a/testing/cmake/check_source_files.cmake +++ b/testing/cmake/check_source_files.cmake @@ -84,6 +84,24 @@ if (NOT valid_count EQUAL 5) "Matched ${valid_count} times, expected 5.") endif() +################################################################################ +# Legacy macro checks. +# Check all files in Thrust to make sure that they aren't using the legacy +# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros. +# +# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET. +# They are provided for legacy purposes and should be replaced with +# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code. +# +# +set(legacy_macro_header_exclusions + # This header defines a legacy CUDART macro: + thrust/system/cuda/config.h +) + +set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED") +set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__") + ################################################################################ # Read source files: foreach(src ${thrust_srcs}) @@ -145,6 +163,21 @@ foreach(src ${thrust_srcs}) set(found_errors 1) endif() endif() + + if (NOT ${src} IN_LIST legacy_macro_header_exclusions) + count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count) + count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count) + + if (NOT thrust_count EQUAL 0) + message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.") + set(found_errors 1) + endif() + + if (NOT cub_count EQUAL 0) + message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.") + set(found_errors 1) + endif() + endif() endforeach() if (NOT found_errors EQUAL 0) diff --git a/thrust/system/cuda/config.h b/thrust/system/cuda/config.h index 251f8d180..654347c29 100644 --- a/thrust/system/cuda/config.h +++ b/thrust/system/cuda/config.h @@ -32,22 +32,47 @@ // older releases. This header will always pull in version info: #include -#if defined(__CUDACC__) || defined(_NVHPC_CUDA) -# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) -# define __THRUST_HAS_CUDART__ 1 -# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__ -# else -# define __THRUST_HAS_CUDART__ 0 -# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__ -# endif +#include + +/** + * \def THRUST_RUNTIME_FUNCTION + * + * Execution space for functions that can use the CUDA runtime API (`__host__` + * when RDC is off, `__host__ __device__` when RDC is on). + */ +#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION + +/** + * \def THRUST_RDC_ENABLED + * + * Defined if RDC is enabled. + */ +#ifdef CUB_RDC_ENABLED +#define THRUST_RDC_ENABLED +#endif + +/** + * \def __THRUST_HAS_CUDART__ + * + * Whether or not the active compiler pass is allowed to invoke device kernels + * or methods from the CUDA runtime API. + * + * This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__` + * and is not compatible with `NV_IF_TARGET`. It is provided for legacy + * purposes only. + * + * Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`. + */ +#ifdef CUB_RUNTIME_ENABLED +#define __THRUST_HAS_CUDART__ 1 #else -# define __THRUST_HAS_CUDART__ 0 -# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__ +#define __THRUST_HAS_CUDART__ 0 #endif // These definitions were intended for internal use only and are now obsolete. // If you relied on them, consider porting your code to use the functionality // in libcu++'s header. +// // For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make // them available again. These should be considered deprecated and will be // fully removed in a future version. diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h index 38f19fa66..0a1b9f0e3 100644 --- a/thrust/system/cuda/detail/adjacent_difference.h +++ b/thrust/system/cuda/detail/adjacent_difference.h @@ -29,12 +29,14 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + #include #include #include #include #include #include +#include #include #include #include @@ -260,27 +262,18 @@ adjacent_difference(execution_policy &policy, OutputIt result, BinaryOp binary_op) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __adjacent_difference::adjacent_difference(policy, - first, - last, - result, - binary_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)), - first, - last, - result, - binary_op); -#endif - } - - return ret; + THRUST_CDP_DISPATCH( + (result = __adjacent_difference::adjacent_difference(policy, + first, + last, + result, + binary_op);), + (result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)), + first, + last, + result, + binary_op);)); + return result; } template +#include + +#include + +/** + * \def THRUST_CDP_DISPATCH + * + * If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always + * run the parallel implementation. Otherwise, run the parallel implementation + * when called from the host, and fallback to the sequential implementation on + * the device. + * + * `par_impl` and `seq_impl` are blocks of C++ statements enclosed in + * parentheses, similar to NV_IF_TARGET blocks: + * + * \code + * THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();)); + * \endcode + */ + +#ifdef THRUST_RDC_ENABLED + +// seq_impl unused. +#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \ + NV_IF_TARGET(NV_ANY_TARGET, par_impl) + +#else // THRUST_RDC_ENABLED + +// Special case for NVCC -- need to inform the device path about the kernels +// that are launched from the host path. +#if defined(__CUDACC__) && defined(__CUDA_ARCH__) + +// Device-side launch not supported, fallback to sequential in device code. +#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \ + if (false) \ + { /* Without this, the device pass won't compile any kernels. */ \ + NV_IF_TARGET(NV_ANY_TARGET, par_impl); \ + } \ + NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl) + +#else // NVCC device pass + +#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \ + NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl) + +#endif // NVCC device pass + +#endif // THRUST_RDC_ENABLED diff --git a/thrust/system/cuda/detail/copy.h b/thrust/system/cuda/detail/copy.h index 949fe9b2a..02a5d2ac1 100644 --- a/thrust/system/cuda/detail/copy.h +++ b/thrust/system/cuda/detail/copy.h @@ -28,7 +28,10 @@ #include +#include + #include +#include #include #include @@ -117,22 +120,11 @@ copy(execution_policy &system, InputIterator last, OutputIterator result) { - OutputIterator ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __copy::device_to_device(system, first, last, result); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::copy(cvt_to_seq(derived_cast(system)), - first, - last, - result); -#endif - } - - return ret; + THRUST_CDP_DISPATCH( + (result = __copy::device_to_device(system, first, last, result);), + (result = + thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);)); + return result; } // end copy() __thrust_exec_check_disable__ @@ -146,19 +138,14 @@ copy_n(execution_policy &system, Size n, OutputIterator result) { - OutputIterator ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __copy::device_to_device(system, first, first + n, result); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result); -#endif - } - - return ret; + THRUST_CDP_DISPATCH( + (result = __copy::device_to_device(system, + first, + thrust::next(first, n), + result);), + (result = + thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);)); + return result; } // end copy_n() #endif diff --git a/thrust/system/cuda/detail/copy_if.h b/thrust/system/cuda/detail/copy_if.h index cd20b296a..1800dae87 100644 --- a/thrust/system/cuda/detail/copy_if.h +++ b/thrust/system/cuda/detail/copy_if.h @@ -29,19 +29,20 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include #include +#include #include -#include -#include +#include +#include +#include #include #include #include -#include -#include -#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -598,17 +599,17 @@ namespace __copy_if { class Predicate, class Size, class NumSelectedOutIt> - static cudaError_t THRUST_RUNTIME_FUNCTION - doit_step(void * d_temp_storage, - size_t & temp_storage_bytes, - ItemsIt items, - StencilIt stencil, - OutputIt output_it, - Predicate predicate, - NumSelectedOutIt num_selected_out, - Size num_items, - cudaStream_t stream, - bool debug_sync) + THRUST_RUNTIME_FUNCTION + static cudaError_t doit_step(void * d_temp_storage, + size_t & temp_storage_bytes, + ItemsIt items, + StencilIt stencil, + OutputIt output_it, + Predicate predicate, + NumSelectedOutIt num_selected_out, + Size num_items, + cudaStream_t stream, + bool debug_sync) { if (num_items == 0) return cudaSuccess; @@ -789,28 +790,19 @@ copy_if(execution_policy &policy, OutputIterator result, Predicate pred) { - OutputIterator ret = result; - - if (__THRUST_HAS_CUDART__) - { - ret = __copy_if::copy_if(policy, - first, - last, - __copy_if::no_stencil_tag(), - result, - pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::copy_if(cvt_to_seq(derived_cast(policy)), - first, - last, - result, - pred); -#endif - } - return ret; + THRUST_CDP_DISPATCH((result = __copy_if::copy_if(policy, + first, + last, + __copy_if::no_stencil_tag(), + result, + pred);), + (result = + thrust::copy_if(cvt_to_seq(derived_cast(policy)), + first, + last, + result, + pred);)); + return result; } // func copy_if __thrust_exec_check_disable__ @@ -827,29 +819,15 @@ copy_if(execution_policy &policy, OutputIterator result, Predicate pred) { - OutputIterator ret = result; - - if (__THRUST_HAS_CUDART__) - { - ret = __copy_if::copy_if(policy, - first, - last, - stencil, - result, - pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::copy_if(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - result, - pred); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (result = __copy_if::copy_if(policy, first, last, stencil, result, pred);), + (result = thrust::copy_if(cvt_to_seq(derived_cast(policy)), + first, + last, + stencil, + result, + pred);)); + return result; } // func copy_if } // namespace cuda_cub diff --git a/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/system/cuda/detail/core/agent_launcher.h index 4cdd7ff46..b604f293e 100644 --- a/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/system/cuda/detail/core/agent_launcher.h @@ -536,8 +536,6 @@ namespace core { return max_blocks_per_sm_impl(k, plan.block_threads); } - - template THRUST_RUNTIME_FUNCTION void print_info(K k) const diff --git a/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/system/cuda/detail/core/triple_chevron_launch.h index bf9955c6d..aeae83a32 100644 --- a/thrust/system/cuda/detail/core/triple_chevron_launch.h +++ b/thrust/system/cuda/detail/core/triple_chevron_launch.h @@ -55,7 +55,6 @@ namespace launcher { shared_mem(shared_mem_), stream(stream_) {} -#if 0 template cudaError_t __host__ doit_host(K k, Args const&... args) const @@ -63,120 +62,6 @@ namespace launcher { k<<>>(args...); return cudaPeekAtLastError(); } -#else - template - cudaError_t __host__ - doit_host(K k, _0 x0) const - { - k<<>>(x0); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1) const - { - k<<>>(x0,x1); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2) const - { - k<<>>(x0,x1,x2); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3) const - { - k<<>>(x0,x1,x2,x3); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - k<<>>(x0,x1,x2,x3,x4); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - k<<>>(x0,x1,x2,x3,x4,x5); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE, _xF xF) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF); - return cudaPeekAtLastError(); - } -#endif template size_t __device__ @@ -186,7 +71,6 @@ namespace launcher { return alignment * ((offset + (alignment - 1))/ alignment); } -#if 0 size_t __device__ argument_pack_size(size_t size) const { return size; } template size_t __device__ @@ -195,110 +79,6 @@ namespace launcher { size = align_up(size); return argument_pack_size(size + sizeof(Arg), args...); } -#else - template - size_t __device__ - argument_pack_size(size_t size, Arg) const - { - return align_up(size) + sizeof(Arg); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF); - } -#endif /* variadic */ template size_t __device__ copy_arg(char* buffer, size_t offset, Arg arg) const @@ -309,664 +89,52 @@ namespace launcher { return offset + sizeof(Arg); } -#if 0 - void __device__ fill_arguments(char*, size_t) const {} + __device__ + void fill_arguments(char*, size_t) const + {} + template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg const& arg, Args const& ... args) const + __device__ + void fill_arguments(char* buffer, + size_t offset, + Arg const& arg, + Args const& ... args) const { fill_arguments(buffer, copy_arg(buffer, offset, arg), args...); } -#else - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg) const - { - copy_arg(buffer, offset, arg); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF); - } -#endif /* variadic */ -#if 0 template cudaError_t __device__ doit_device(K k, Args const&... args) const { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ const size_t size = argument_pack_size(0,args...); void *param_buffer = cudaGetParameterBuffer(64,size); fill_arguments((char*)param_buffer, 0, args...); - status = launch_device(k, param_buffer); -#endif - return status; - } -#else - template - cudaError_t __device__ - doit_device(K k, _0 x0) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); -#endif - return status; + return launch_device(k, param_buffer); } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); - THRUST_UNUSED_VAR(xC); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); - THRUST_UNUSED_VAR(xC); - THRUST_UNUSED_VAR(xD); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); - THRUST_UNUSED_VAR(xC); - THRUST_UNUSED_VAR(xD); - THRUST_UNUSED_VAR(xE); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); - THRUST_UNUSED_VAR(xC); - THRUST_UNUSED_VAR(xD); - THRUST_UNUSED_VAR(xE); - THRUST_UNUSED_VAR(xF); -#endif - return status; - } -#endif /* variadic */ template cudaError_t __device__ launch_device(K k, void* buffer) const { -#if __THRUST_HAS_CUDART__ return cudaLaunchDevice((void*)k, buffer, dim3(grid), dim3(block), shared_mem, stream); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(buffer); - return cudaErrorNotSupported; -#endif } - -#if defined(_NVHPC_CUDA) -# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(...) \ - (__builtin_is_device_code() ? \ - doit_device(__VA_ARGS__) : doit_host(__VA_ARGS__)) -#elif defined(__CUDA_ARCH__) -# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_device -#else -# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_host -#endif - -#if 0 __thrust_exec_check_disable__ template - cudaError_t THRUST_FUNCTION - doit(K k, Args const&... args) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, args...); - } -#else - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const + THRUST_FUNCTION + cudaError_t doit(K k, Args const&... args) const { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); + NV_IF_TARGET(NV_IS_HOST, + (return doit_host(k, args...);), + (return doit_device(k, args...);)); } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE, _xF xF) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF); - } -#endif -#undef THRUST_TRIPLE_LAUNCHER_HOSTDEVICE + }; // struct triple_chevron } // namespace launcher diff --git a/thrust/system/cuda/detail/core/util.h b/thrust/system/cuda/detail/core/util.h index 4e014ccc6..11efc0858 100644 --- a/thrust/system/cuda/detail/core/util.h +++ b/thrust/system/cuda/detail/core/util.h @@ -352,14 +352,9 @@ namespace core { }; template - typename get_plan::type THRUST_RUNTIME_FUNCTION - get_agent_plan(int ptx_version) + THRUST_RUNTIME_FUNCTION + typename get_plan::type get_agent_plan(int ptx_version) { - // Use one path, with Agent::ptx_plan, for device code where device-side - // kernel launches are supported. The other path, with - // get_agent_plan_impl::get(version), is for host code and for device - // code without device-side kernel launches. -#ifdef __THRUST_HAS_CUDART__ NV_IF_TARGET( NV_IS_DEVICE, ( @@ -369,9 +364,6 @@ namespace core { return plan_type{ptx_plan{}}; ), // NV_IS_HOST: ( return get_agent_plan_impl::get(ptx_version); )); -#else - return get_agent_plan_impl::get(ptx_version); -#endif } // XXX keep this dead-code for now as a gentle reminder @@ -456,7 +448,7 @@ namespace core { ///////////////////////// THRUST_RUNTIME_FUNCTION - int get_sm_count() + inline int get_sm_count() { int dev_id; cuda_cub::throw_on_error(cudaGetDevice(&dev_id), @@ -474,8 +466,8 @@ namespace core { return i32value; } - size_t THRUST_RUNTIME_FUNCTION - get_max_shared_memory_per_block() + THRUST_RUNTIME_FUNCTION + inline size_t get_max_shared_memory_per_block() { int dev_id; cuda_cub::throw_on_error(cudaGetDevice(&dev_id), @@ -494,8 +486,8 @@ namespace core { return static_cast(i32value); } - size_t THRUST_RUNTIME_FUNCTION - virtual_shmem_size(size_t shmem_per_block) + THRUST_RUNTIME_FUNCTION + inline size_t virtual_shmem_size(size_t shmem_per_block) { size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); if (shmem_per_block > max_shmem_per_block) @@ -504,8 +496,8 @@ namespace core { return 0; } - size_t THRUST_RUNTIME_FUNCTION - vshmem_size(size_t shmem_per_block, size_t num_blocks) + THRUST_RUNTIME_FUNCTION + inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks) { size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); if (shmem_per_block > max_shmem_per_block) @@ -622,16 +614,16 @@ namespace core { __host__ __device__ operator T const &() const { return value_; } }; - cuda_optional THRUST_RUNTIME_FUNCTION - get_ptx_version() + THRUST_RUNTIME_FUNCTION + inline cuda_optional get_ptx_version() { int ptx_version = 0; cudaError_t status = cub::PtxVersion(ptx_version); return cuda_optional(ptx_version, status); } - cudaError_t THRUST_RUNTIME_FUNCTION - sync_stream(cudaStream_t stream) + THRUST_RUNTIME_FUNCTION + inline cudaError_t sync_stream(cudaStream_t stream) { return cub::SyncStream(stream); } diff --git a/thrust/system/cuda/detail/extrema.h b/thrust/system/cuda/detail/extrema.h index 0519b7df3..5ceda54f3 100644 --- a/thrust/system/cuda/detail/extrema.h +++ b/thrust/system/cuda/detail/extrema.h @@ -29,14 +29,15 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include #include #include +#include #include #include -#include +#include +#include +#include #include @@ -421,24 +422,16 @@ min_element(execution_policy &policy, ItemsIt last, BinaryPred binary_pred) { - ItemsIt ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = __extrema::element<__extrema::arg_min_f>(policy, - first, - last, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::min_element(cvt_to_seq(derived_cast(policy)), - first, - last, - binary_pred); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (last = __extrema::element<__extrema::arg_min_f>(policy, + first, + last, + binary_pred);), + (last = thrust::min_element(cvt_to_seq(derived_cast(policy)), + first, + last, + binary_pred);)); + return last; } template &policy, ItemsIt last, BinaryPred binary_pred) { - ItemsIt ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = __extrema::element<__extrema::arg_max_f>(policy, - first, - last, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::max_element(cvt_to_seq(derived_cast(policy)), - first, - last, - binary_pred); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (last = __extrema::element<__extrema::arg_max_f>(policy, + first, + last, + binary_pred);), + (last = thrust::max_element(cvt_to_seq(derived_cast(policy)), + first, + last, + binary_pred);)); + return last; } template &policy, ItemsIt last, BinaryPred binary_pred) { - pair ret = thrust::make_pair(first, first); - - if (__THRUST_HAS_CUDART__) + auto ret = thrust::make_pair(last, last); + if (first == last) { - if (first == last) - return thrust::make_pair(last, last); - - typedef typename iterator_traits::value_type InputType; - typedef typename iterator_traits::difference_type IndexType; - - IndexType num_items = static_cast(thrust::distance(first, last)); - - - typedef tuple > iterator_tuple; - typedef zip_iterator zip_iterator; - - iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t(0)); - - - typedef __extrema::arg_minmax_f arg_minmax_t; - typedef typename arg_minmax_t::two_pairs_type two_pairs_type; - typedef typename arg_minmax_t::duplicate_tuple duplicate_t; - typedef transform_input_iterator_t - transform_t; - - zip_iterator begin = make_zip_iterator(iter_tuple); - two_pairs_type result = __extrema::extrema(policy, - transform_t(begin, duplicate_t()), - num_items, - arg_minmax_t(binary_pred), - (two_pairs_type *)(NULL)); - ret = thrust::make_pair(first + get<1>(get<0>(result)), - first + get<1>(get<1>(result))); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)), - first, - last, - binary_pred); -#endif + return ret; } + + THRUST_CDP_DISPATCH( + (using InputType = typename iterator_traits::value_type; + using IndexType = typename iterator_traits::difference_type; + + const auto num_items = + static_cast(thrust::distance(first, last)); + + using iterator_tuple = tuple>; + using zip_iterator = zip_iterator; + + iterator_tuple iter_tuple = + thrust::make_tuple(first, counting_iterator_t(0)); + + using arg_minmax_t = + __extrema::arg_minmax_f; + using two_pairs_type = typename arg_minmax_t::two_pairs_type; + using duplicate_t = typename arg_minmax_t::duplicate_tuple; + using transform_t = + transform_input_iterator_t; + + zip_iterator begin = make_zip_iterator(iter_tuple); + two_pairs_type result = + __extrema::extrema(policy, + transform_t(begin, duplicate_t()), + num_items, + arg_minmax_t(binary_pred), + (two_pairs_type *)(NULL)); + ret = thrust::make_pair(first + get<1>(get<0>(result)), + first + get<1>(get<1>(result)));), + // CDP Sequential impl: + (ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)), + first, + last, + binary_pred);)); return ret; } diff --git a/thrust/system/cuda/detail/merge.h b/thrust/system/cuda/detail/merge.h index b8b17012b..1e4bfa384 100644 --- a/thrust/system/cuda/detail/merge.h +++ b/thrust/system/cuda/detail/merge.h @@ -29,20 +29,20 @@ j * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + #include #include -#include - -#include -#include +#include +#include +#include +#include +#include +#include #include #include +#include +#include #include -#include -#include -#include -#include -#include THRUST_NAMESPACE_BEGIN @@ -876,38 +876,28 @@ merge(execution_policy& policy, CompareOp compare_op) { - ResultIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typedef typename thrust::iterator_value::type keys_type; - // - keys_type* null_ = NULL; - // - ret = __merge::merge(policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - null_, - null_, - result, - null_, - compare_op) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::merge(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - result, - compare_op); -#endif - } - return ret; + THRUST_CDP_DISPATCH((using keys_type = thrust::iterator_value_t; + keys_type *null_ = nullptr; + auto tmp = + __merge::merge(policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + null_, + null_, + result, + null_, + compare_op); + result = tmp.first;), + (result = thrust::merge(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + result, + compare_op);)); + return result; } template @@ -950,10 +940,9 @@ merge_by_key(execution_policy &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - return __merge::merge(policy, + auto ret = thrust::make_pair(keys_result, items_result); + THRUST_CDP_DISPATCH( + (ret = __merge::merge(policy, keys1_first, keys1_last, keys2_first, @@ -962,23 +951,17 @@ merge_by_key(execution_policy &policy, items2_first, keys_result, items_result, - compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::merge_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); -#endif - } + compare_op);), + (ret = thrust::merge_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);)); return ret; } diff --git a/thrust/system/cuda/detail/par_to_seq.h b/thrust/system/cuda/detail/par_to_seq.h index 833634982..e710f017b 100644 --- a/thrust/system/cuda/detail/par_to_seq.h +++ b/thrust/system/cuda/detail/par_to_seq.h @@ -82,11 +82,5 @@ cvt_to_seq(Policy& policy) return cvt_to_seq_impl::doit(policy); } -#if __THRUST_HAS_CUDART__ -#define THRUST_CUDART_DISPATCH par -#else -#define THRUST_CUDART_DISPATCH seq -#endif - } // namespace cuda_ THRUST_NAMESPACE_END diff --git a/thrust/system/cuda/detail/parallel_for.h b/thrust/system/cuda/detail/parallel_for.h index be4ff14a5..3e36affef 100644 --- a/thrust/system/cuda/detail/parallel_for.h +++ b/thrust/system/cuda/detail/parallel_for.h @@ -29,13 +29,13 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include #include -#include +#include +#include #include #include +#include THRUST_NAMESPACE_BEGIN @@ -155,21 +155,22 @@ parallel_for(execution_policy &policy, Size count) { if (count == 0) - return; - - if (__THRUST_HAS_CUDART__) - { - cudaStream_t stream = cuda_cub::stream(policy); - cudaError_t status = __parallel_for::parallel_for(count, f, stream); - cuda_cub::throw_on_error(status, "parallel_for failed"); - } - else { -#if !__THRUST_HAS_CUDART__ - for (Size idx = 0; idx != count; ++idx) - f(idx); -#endif + return; } + + // clang-format off + THRUST_CDP_DISPATCH( + (cudaStream_t stream = cuda_cub::stream(policy); + cudaError_t status = __parallel_for::parallel_for(count, f, stream); + cuda_cub::throw_on_error(status, "parallel_for failed");), + // CDP sequential impl: + (for (Size idx = 0; idx != count; ++idx) + { + f(idx); + } + )); + // clang-format on } } // namespace cuda_cub diff --git a/thrust/system/cuda/detail/partition.h b/thrust/system/cuda/detail/partition.h index 85d9bb813..b6df7b2b2 100644 --- a/thrust/system/cuda/detail/partition.h +++ b/thrust/system/cuda/detail/partition.h @@ -29,21 +29,25 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include #include #include -#include -#include +#include +#include +#include +#include +#include +#include #include +#include #include -#include -#include #include -#include -#include -#include +#include +#include // cub::ScanTileState +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -846,29 +850,22 @@ partition_copy(execution_policy &policy, RejectedOutIt rejected_result, Predicate predicate) { - pair ret = thrust::make_pair(selected_result, rejected_result); - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition(policy, - first, - last, - stencil, - selected_result, - rejected_result, - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - selected_result, - rejected_result, - predicate); -#endif - } + auto ret = thrust::make_pair(selected_result, rejected_result); + THRUST_CDP_DISPATCH( + (ret = __partition::partition(policy, + first, + last, + stencil, + selected_result, + rejected_result, + predicate);), + (ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + stencil, + selected_result, + rejected_result, + predicate);)); return ret; } @@ -886,28 +883,21 @@ partition_copy(execution_policy &policy, RejectedOutIt rejected_result, Predicate predicate) { - pair ret = thrust::make_pair(selected_result, rejected_result); - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition(policy, - first, - last, - __partition::no_stencil_tag(), - selected_result, - rejected_result, - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - selected_result, - rejected_result, - predicate); -#endif - } + auto ret = thrust::make_pair(selected_result, rejected_result); + THRUST_CDP_DISPATCH( + (ret = __partition::partition(policy, + first, + last, + __partition::no_stencil_tag(), + selected_result, + rejected_result, + predicate);), + (ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + selected_result, + rejected_result, + predicate);)); return ret; } @@ -925,28 +915,21 @@ stable_partition_copy(execution_policy &policy, RejectedOutIt rejected_result, Predicate predicate) { - pair ret = thrust::make_pair(selected_result, rejected_result); - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition(policy, - first, - last, - __partition::no_stencil_tag(), - selected_result, - rejected_result, - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - selected_result, - rejected_result, - predicate); -#endif - } + auto ret = thrust::make_pair(selected_result, rejected_result); + THRUST_CDP_DISPATCH( + (ret = __partition::partition(policy, + first, + last, + __partition::no_stencil_tag(), + selected_result, + rejected_result, + predicate);), + (ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + selected_result, + rejected_result, + predicate);)); return ret; } @@ -966,29 +949,22 @@ stable_partition_copy(execution_policy &policy, RejectedOutIt rejected_result, Predicate predicate) { - pair ret = thrust::make_pair(selected_result, rejected_result); - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition(policy, - first, - last, - stencil, - selected_result, - rejected_result, - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - selected_result, - rejected_result, - predicate); -#endif - } + auto ret = thrust::make_pair(selected_result, rejected_result); + THRUST_CDP_DISPATCH( + (ret = __partition::partition(policy, + first, + last, + stencil, + selected_result, + rejected_result, + predicate);), + (ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + stencil, + selected_result, + rejected_result, + predicate);)); return ret; } @@ -1006,22 +982,15 @@ partition(execution_policy &policy, StencilIt stencil, Predicate predicate) { - Iterator ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition_inplace(policy, first, last, stencil, predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::partition(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - predicate); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (last = + __partition::partition_inplace(policy, first, last, stencil, predicate);), + (last = thrust::partition(cvt_to_seq(derived_cast(policy)), + first, + last, + stencil, + predicate);)); + return last; } __thrust_exec_check_disable__ @@ -1034,25 +1003,17 @@ partition(execution_policy &policy, Iterator last, Predicate predicate) { - Iterator ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition_inplace(policy, - first, - last, - __partition::no_stencil_tag(), - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::partition(cvt_to_seq(derived_cast(policy)), - first, - last, - predicate); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (last = __partition::partition_inplace(policy, + first, + last, + __partition::no_stencil_tag(), + predicate);), + (last = thrust::partition(cvt_to_seq(derived_cast(policy)), + first, + last, + predicate);)); + return last; } __thrust_exec_check_disable__ @@ -1067,30 +1028,20 @@ stable_partition(execution_policy &policy, StencilIt stencil, Predicate predicate) { - Iterator result = first; - if (__THRUST_HAS_CUDART__) - { - result = __partition::partition_inplace(policy, + auto ret = last; + THRUST_CDP_DISPATCH( + (ret = + __partition::partition_inplace(policy, first, last, stencil, predicate); + + /* partition returns rejected values in reverse order + so reverse the rejected elements to make it stable */ + cuda_cub::reverse(policy, ret, last);), + (ret = thrust::stable_partition(cvt_to_seq(derived_cast(policy)), first, last, stencil, - predicate); - - // partition returns rejected values in reverese order - // so reverse the rejected elements to make it stable - cuda_cub::reverse(policy, result, last); - } - else - { -#if !__THRUST_HAS_CUDART__ - result = thrust::stable_partition(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - predicate); -#endif - } - return result; + predicate);)); + return ret; } __thrust_exec_check_disable__ @@ -1103,29 +1054,22 @@ stable_partition(execution_policy &policy, Iterator last, Predicate predicate) { - Iterator result = first; - if (__THRUST_HAS_CUDART__) - { - result = __partition::partition_inplace(policy, - first, - last, - __partition::no_stencil_tag(), - predicate); - - // partition returns rejected values in reverese order - // so reverse the rejected elements to make it stable - cuda_cub::reverse(policy, result, last); - } - else - { -#if !__THRUST_HAS_CUDART__ - result = thrust::stable_partition(cvt_to_seq(derived_cast(policy)), - first, - last, - predicate); -#endif - } - return result; + auto ret = last; + THRUST_CDP_DISPATCH( + (ret = __partition::partition_inplace(policy, + first, + last, + __partition::no_stencil_tag(), + predicate); + + /* partition returns rejected values in reverse order + so reverse the rejected elements to make it stable */ + cuda_cub::reverse(policy, ret, last);), + (ret = thrust::stable_partition(cvt_to_seq(derived_cast(policy)), + first, + last, + predicate);)); + return ret; } template #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include #include -#include -#include +#include #include +#include #include -#include -#include -#include -#include -#include +#include #include +#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -196,6 +197,9 @@ namespace __reduce { { cub::GridMappingStrategy grid_mapping; + THRUST_RUNTIME_FUNCTION + Plan() {} + template THRUST_RUNTIME_FUNCTION Plan(P) : core::AgentPlan(P()), @@ -1018,14 +1022,18 @@ T reduce_n(execution_policy& policy, T init, BinaryOp binary_op) { - if (__THRUST_HAS_CUDART__) - return thrust::cuda_cub::detail::reduce_n_impl( - policy, first, num_items, init, binary_op); - - #if !__THRUST_HAS_CUDART__ - return thrust::reduce( - cvt_to_seq(derived_cast(policy)), first, first + num_items, init, binary_op); - #endif + THRUST_CDP_DISPATCH((init = + thrust::cuda_cub::detail::reduce_n_impl(policy, + first, + num_items, + init, + binary_op);), + (init = thrust::reduce(cvt_to_seq(derived_cast(policy)), + first, + first + num_items, + init, + binary_op);)); + return init; } template diff --git a/thrust/system/cuda/detail/reduce_by_key.h b/thrust/system/cuda/detail/reduce_by_key.h index 87a5bb454..5cf23a99c 100644 --- a/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/system/cuda/detail/reduce_by_key.h @@ -29,25 +29,26 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include +#include #include -#include -#include +#include +#include #include +#include #include -#include -#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -1122,35 +1123,27 @@ reduce_by_key(execution_policy &policy, BinaryPred binary_pred, BinaryOp binary_op) { - pair ret = thrust::make_pair(keys_output, values_output); - if (__THRUST_HAS_CUDART__) - { - ret = __reduce_by_key::reduce_by_key(policy, - keys_first, - keys_last, - values_first, - keys_output, - values_output, - binary_pred, - binary_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::reduce_by_key(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - keys_output, - values_output, - binary_pred, - binary_op); -#endif - } + auto ret = thrust::make_pair(keys_output, values_output); + THRUST_CDP_DISPATCH((ret = __reduce_by_key::reduce_by_key(policy, + keys_first, + keys_last, + values_first, + keys_output, + values_output, + binary_pred, + binary_op);), + (ret = + thrust::reduce_by_key(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values_first, + keys_output, + values_output, + binary_pred, + binary_op);)); return ret; } - template #include #include +#include #include #include @@ -220,26 +221,18 @@ OutputIt inclusive_scan_n(thrust::cuda_cub::execution_policy &policy, OutputIt result, ScanOp scan_op) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy, - first, - num_items, - result, - scan_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)), - first, - first + num_items, - result, - scan_op); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (result = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy, + first, + num_items, + result, + scan_op);), + (result = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)), + first, + first + num_items, + result, + scan_op);)); + return result; } template @@ -288,28 +281,20 @@ OutputIt exclusive_scan_n(thrust::cuda_cub::execution_policy &policy, T init, ScanOp scan_op) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = thrust::cuda_cub::detail::exclusive_scan_n_impl(policy, - first, - num_items, - result, - init, - scan_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::exclusive_scan(cvt_to_seq(derived_cast(policy)), - first, - first + num_items, - result, - init, - scan_op); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (result = thrust::cuda_cub::detail::exclusive_scan_n_impl(policy, + first, + num_items, + result, + init, + scan_op);), + (result = thrust::exclusive_scan(cvt_to_seq(derived_cast(policy)), + first, + first + num_items, + result, + init, + scan_op);)); + return result; } template #include +#include +#include #include #include +#include #include #include #include @@ -305,29 +308,23 @@ inclusive_scan_by_key(execution_policy &policy, ScanOp scan_op) { ValOutputIt ret = value_result; - if (__THRUST_HAS_CUDART__) - { - ret = thrust::cuda_cub::detail::inclusive_scan_by_key_n( - policy, - key_first, - value_first, - value_result, - thrust::distance(key_first, key_last), - binary_pred, - scan_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::inclusive_scan_by_key(cvt_to_seq(derived_cast(policy)), - key_first, - key_last, - value_first, - value_result, - binary_pred, - scan_op); -#endif - } + THRUST_CDP_DISPATCH( + (ret = thrust::cuda_cub::detail::inclusive_scan_by_key_n( + policy, + key_first, + value_first, + value_result, + thrust::distance(key_first, key_last), + binary_pred, + scan_op);), + (ret = thrust::inclusive_scan_by_key(cvt_to_seq(derived_cast(policy)), + key_first, + key_last, + value_first, + value_result, + binary_pred, + scan_op);)); + return ret; } @@ -396,31 +393,24 @@ exclusive_scan_by_key(execution_policy &policy, ScanOp scan_op) { ValOutputIt ret = value_result; - if (__THRUST_HAS_CUDART__) - { - ret = thrust::cuda_cub::detail::exclusive_scan_by_key_n( - policy, - key_first, - value_first, - value_result, - thrust::distance(key_first, key_last), - init, - binary_pred, - scan_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::exclusive_scan_by_key(cvt_to_seq(derived_cast(policy)), - key_first, - key_last, - value_first, - value_result, - init, - binary_pred, - scan_op); -#endif - } + THRUST_CDP_DISPATCH( + (ret = thrust::cuda_cub::detail::exclusive_scan_by_key_n( + policy, + key_first, + value_first, + value_result, + thrust::distance(key_first, key_last), + init, + binary_pred, + scan_op);), + (ret = thrust::exclusive_scan_by_key(cvt_to_seq(derived_cast(policy)), + key_first, + key_last, + value_first, + value_result, + init, + binary_pred, + scan_op);)); return ret; } diff --git a/thrust/system/cuda/detail/set_operations.h b/thrust/system/cuda/detail/set_operations.h index 58e67547c..1bc942460 100644 --- a/thrust/system/cuda/detail/set_operations.h +++ b/thrust/system/cuda/detail/set_operations.h @@ -29,20 +29,22 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include #include +#include #include -#include -#include -#include -#include +#include #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include + THRUST_NAMESPACE_BEGIN @@ -1363,38 +1365,30 @@ set_difference(execution_policy &policy, OutputIt result, CompareOp compare) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typename thrust::iterator_value::type *null_ = NULL; - // - ret = __set_operations::set_operations( - policy, - items1_first, - items1_last, - items2_first, - items2_last, - null_, - null_, - result, - null_, - compare, - __set_operations::serial_set_difference()) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_difference(cvt_to_seq(derived_cast(policy)), - items1_first, - items1_last, - items2_first, - items2_last, - result, - compare); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (using items1_t = thrust::iterator_value_t; + items1_t *null_ = nullptr; + auto tmp = __set_operations::set_operations( + policy, + items1_first, + items1_last, + items2_first, + items2_last, + null_, + null_, + result, + null_, + compare, + __set_operations::serial_set_difference()); + result = tmp.first;), + (result = thrust::set_difference(cvt_to_seq(derived_cast(policy)), + items1_first, + items1_last, + items2_first, + items2_last, + result, + compare);)); + return result; } template &policy, OutputIt result, CompareOp compare) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typename thrust::iterator_value::type *null_ = NULL; - // - ret = __set_operations::set_operations( - policy, - items1_first, - items1_last, - items2_first, - items2_last, - null_, - null_, - result, - null_, - compare, - __set_operations::serial_set_intersection()) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_intersection(cvt_to_seq(derived_cast(policy)), - items1_first, - items1_last, - items2_first, - items2_last, - result, - compare); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (using items1_t = thrust::iterator_value_t; + items1_t *null_ = NULL; + auto tmp = __set_operations::set_operations( + policy, + items1_first, + items1_last, + items2_first, + items2_last, + null_, + null_, + result, + null_, + compare, + __set_operations::serial_set_intersection()); + result = tmp.first;), + (result = thrust::set_intersection(cvt_to_seq(derived_cast(policy)), + items1_first, + items1_last, + items2_first, + items2_last, + result, + compare);)); + return result; } template &policy, OutputIt result, CompareOp compare) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typename thrust::iterator_value::type *null_ = NULL; - // - ret = __set_operations::set_operations( - policy, - items1_first, - items1_last, - items2_first, - items2_last, - null_, - null_, - result, - null_, - compare, - __set_operations::serial_set_symmetric_difference()) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_symmetric_difference(cvt_to_seq(derived_cast(policy)), - items1_first, - items1_last, - items2_first, - items2_last, - result, - compare); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (using items1_t = thrust::iterator_value_t; + items1_t *null_ = nullptr; + auto tmp = __set_operations::set_operations( + policy, + items1_first, + items1_last, + items2_first, + items2_last, + null_, + null_, + result, + null_, + compare, + __set_operations::serial_set_symmetric_difference()); + result = tmp.first;), + (result = thrust::set_symmetric_difference(cvt_to_seq(derived_cast(policy)), + items1_first, + items1_last, + items2_first, + items2_last, + result, + compare);)); + return result; } - template &policy, OutputIt result, CompareOp compare) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typename thrust::iterator_value::type *null_ = NULL; - // - ret = __set_operations::set_operations( - policy, - items1_first, - items1_last, - items2_first, - items2_last, - null_, - null_, - result, - null_, - compare, - __set_operations::serial_set_union()) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_union(cvt_to_seq(derived_cast(policy)), - items1_first, - items1_last, - items2_first, - items2_last, - result, - compare); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (using items1_t = thrust::iterator_value_t; + items1_t *null_ = nullptr; + auto tmp = __set_operations::set_operations( + policy, + items1_first, + items1_last, + items2_first, + items2_last, + null_, + null_, + result, + null_, + compare, + __set_operations::serial_set_union()); + result = tmp.first;), + (result = thrust::set_union(cvt_to_seq(derived_cast(policy)), + items1_first, + items1_last, + items2_first, + items2_last, + result, + compare);)); + return result; } - template &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - ret = __set_operations::set_operations( - policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op, - __set_operations::serial_set_difference()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_difference_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + THRUST_CDP_DISPATCH( + (ret = __set_operations::set_operations( + policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op, + __set_operations::serial_set_difference());), + (ret = thrust::set_difference_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);)); return ret; } @@ -1759,36 +1720,29 @@ set_intersection_by_key(execution_policy &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - ret = __set_operations::set_operations( - policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items1_first, - keys_result, - items_result, - compare_op, - __set_operations::serial_set_intersection()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_intersection_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + THRUST_CDP_DISPATCH( + (ret = __set_operations::set_operations( + policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items1_first, + keys_result, + items_result, + compare_op, + __set_operations::serial_set_intersection());), + (ret = thrust::set_intersection_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + keys_result, + items_result, + compare_op);)); return ret; } @@ -1844,37 +1798,31 @@ set_symmetric_difference_by_key(execution_policy &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - ret = __set_operations::set_operations( - policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op, - __set_operations::serial_set_symmetric_difference()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_symmetric_difference_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + THRUST_CDP_DISPATCH( + (ret = __set_operations::set_operations( + policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op, + __set_operations::serial_set_symmetric_difference());), + (ret = + thrust::set_symmetric_difference_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);)); return ret; } @@ -1932,37 +1880,30 @@ set_union_by_key(execution_policy &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - ret = __set_operations::set_operations( - policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op, - __set_operations::serial_set_union()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_union_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + THRUST_CDP_DISPATCH( + (ret = __set_operations::set_operations( + policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op, + __set_operations::serial_set_union());), + (ret = thrust::set_union_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);)); return ret; } diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h index 4babc3383..94c2c3b37 100644 --- a/thrust/system/cuda/detail/sort.h +++ b/thrust/system/cuda/detail/sort.h @@ -29,26 +29,29 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include +#include +#include +#include +#include + #include +#include #include #include -#include -#include - #include #include -#include -#include -#include -#include -#include -#include +#include + #include +#include +#include +#include +#include + #include +#include +#include THRUST_NAMESPACE_BEGIN namespace cuda_cub { @@ -515,18 +518,15 @@ sort(execution_policy& policy, ItemsIt last, CompareOp compare_op) { - if (__THRUST_HAS_CUDART__) - { - typedef typename thrust::iterator_value::type item_type; - __smart_sort::smart_sort( - policy, first, last, (item_type*)NULL, compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - thrust::sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op); -#endif - } + THRUST_CDP_DISPATCH( + (using item_t = thrust::iterator_value_t; item_t *null_ = nullptr; + __smart_sort::smart_sort(policy, + first, + last, + null_, + compare_op);), + (thrust::sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op);)); } __thrust_exec_check_disable__ @@ -537,18 +537,18 @@ stable_sort(execution_policy& policy, ItemsIt last, CompareOp compare_op) { - if (__THRUST_HAS_CUDART__) - { - typedef typename thrust::iterator_value::type item_type; - __smart_sort::smart_sort( - policy, first, last, (item_type*)NULL, compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - thrust::stable_sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op); -#endif - } + THRUST_CDP_DISPATCH( + (using item_t = thrust::iterator_value_t; item_t *null_ = nullptr; + __smart_sort::smart_sort(policy, + first, + last, + null_, + compare_op);), + (thrust::stable_sort(cvt_to_seq(derived_cast(policy)), + first, + last, + compare_op);)); } __thrust_exec_check_disable__ @@ -560,18 +560,18 @@ sort_by_key(execution_policy& policy, ValuesIt values, CompareOp compare_op) { - if (__THRUST_HAS_CUDART__) - { - __smart_sort::smart_sort( - policy, keys_first, keys_last, values, compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - thrust::sort_by_key( - cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values, compare_op); -#endif - } + THRUST_CDP_DISPATCH( + (__smart_sort::smart_sort(policy, + keys_first, + keys_last, + values, + compare_op);), + (thrust::sort_by_key(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values, + compare_op);)); } __thrust_exec_check_disable__ @@ -586,18 +586,18 @@ stable_sort_by_key(execution_policy &policy, ValuesIt values, CompareOp compare_op) { - if (__THRUST_HAS_CUDART__) - { - __smart_sort::smart_sort( - policy, keys_first, keys_last, values, compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - thrust::stable_sort_by_key( - cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values, compare_op); -#endif - } + THRUST_CDP_DISPATCH( + (__smart_sort::smart_sort(policy, + keys_first, + keys_last, + values, + compare_op);), + (thrust::stable_sort_by_key(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values, + compare_op);)); } // API with default comparator diff --git a/thrust/system/cuda/detail/unique.h b/thrust/system/cuda/detail/unique.h index d41819605..621b0289c 100644 --- a/thrust/system/cuda/detail/unique.h +++ b/thrust/system/cuda/detail/unique.h @@ -29,21 +29,20 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include -#include +#include #include -#include -#include -#include -#include -#include #include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -730,26 +729,14 @@ unique_copy(execution_policy &policy, OutputIt result, BinaryPred binary_pred) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __unique::unique(policy, - first, - last, - result, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::unique_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - result, - binary_pred); -#endif - } - return ret; + THRUST_CDP_DISPATCH( + (result = __unique::unique(policy, first, last, result, binary_pred);), + (result = thrust::unique_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + result, + binary_pred);)); + return result; } template &policy, BinaryPred binary_pred) { ForwardIt ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = cuda_cub::unique_copy(policy, first, last, first, binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::unique(cvt_to_seq(derived_cast(policy)), - first, - last, - binary_pred); -#endif - } + THRUST_CDP_DISPATCH( + (ret = cuda_cub::unique_copy(policy, first, last, first, binary_pred);), + (ret = thrust::unique(cvt_to_seq(derived_cast(policy)), + first, + last, + binary_pred);)); return ret; } diff --git a/thrust/system/cuda/detail/unique_by_key.h b/thrust/system/cuda/detail/unique_by_key.h index 1835bf599..b213ea154 100644 --- a/thrust/system/cuda/detail/unique_by_key.h +++ b/thrust/system/cuda/detail/unique_by_key.h @@ -29,22 +29,23 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include #include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -824,29 +825,22 @@ unique_by_key_copy(execution_policy &policy, ValOutputIt values_result, BinaryPred binary_pred) { - pair ret = thrust::make_pair(keys_result, values_result); - if (__THRUST_HAS_CUDART__) - { - ret = __unique_by_key::unique_by_key(policy, - keys_first, - keys_last, - values_first, - keys_result, - values_result, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - keys_result, - values_result, - binary_pred); -#endif - } + auto ret = thrust::make_pair(keys_result, values_result); + THRUST_CDP_DISPATCH( + (ret = __unique_by_key::unique_by_key(policy, + keys_first, + keys_last, + values_first, + keys_result, + values_result, + binary_pred);), + (ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values_first, + keys_result, + values_result, + binary_pred);)); return ret; } @@ -884,27 +878,20 @@ unique_by_key(execution_policy &policy, ValInputIt values_first, BinaryPred binary_pred) { - pair ret = thrust::make_pair(keys_first, values_first); - if (__THRUST_HAS_CUDART__) - { - ret = cuda_cub::unique_by_key_copy(policy, - keys_first, - keys_last, - values_first, - keys_first, - values_first, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - binary_pred); -#endif - } + auto ret = thrust::make_pair(keys_first, values_first); + THRUST_CDP_DISPATCH( + (ret = cuda_cub::unique_by_key_copy(policy, + keys_first, + keys_last, + values_first, + keys_first, + values_first, + binary_pred);), + (ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values_first, + binary_pred);)); return ret; } diff --git a/thrust/system/cuda/detail/util.h b/thrust/system/cuda/detail/util.h index 1b6580271..5fcb6432a 100644 --- a/thrust/system/cuda/detail/util.h +++ b/thrust/system/cuda/detail/util.h @@ -206,10 +206,12 @@ terminate() __host__ __device__ inline void throw_on_error(cudaError_t status) { -#if __THRUST_HAS_CUDART__ // Clear the global CUDA error state which may have been set by the last // call. Otherwise, errors may "leak" to unrelated kernel launches. +#ifdef THRUST_RDC_ENABLED cudaGetLastError(); +#else + NV_IF_TARGET(NV_IS_HOST, (cudaGetLastError();)); #endif if (cudaSuccess != status) @@ -217,7 +219,7 @@ inline void throw_on_error(cudaError_t status) // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device // instructions out of the target logic. -#if __THRUST_HAS_CUDART__ +#ifdef THRUST_RDC_ENABLED #define THRUST_TEMP_DEVICE_CODE \ printf("Thrust CUDA backend error: %s: %s\n", \ @@ -247,17 +249,19 @@ inline void throw_on_error(cudaError_t status) __host__ __device__ inline void throw_on_error(cudaError_t status, char const *msg) { -#if __THRUST_HAS_CUDART__ // Clear the global CUDA error state which may have been set by the last // call. Otherwise, errors may "leak" to unrelated kernel launches. +#ifdef THRUST_RDC_ENABLED cudaGetLastError(); +#else + NV_IF_TARGET(NV_IS_HOST, (cudaGetLastError();)); #endif if (cudaSuccess != status) { // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device // instructions out of the target logic. -#if __THRUST_HAS_CUDART__ +#ifdef THRUST_RDC_ENABLED #define THRUST_TEMP_DEVICE_CODE \ printf("Thrust CUDA backend error: %s: %s: %s\n", \ From 3b4d8389b7c49ded35b098127ba6ee94eeee0f1b Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Wed, 27 Apr 2022 17:37:39 -0400 Subject: [PATCH 2/3] Add testing for CDP seq fallbacks when RDC is disabled. --- testing/cuda/CMakeLists.txt | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/testing/cuda/CMakeLists.txt b/testing/cuda/CMakeLists.txt index 6df1b19c0..c1e7a545c 100644 --- a/testing/cuda/CMakeLists.txt +++ b/testing/cuda/CMakeLists.txt @@ -6,6 +6,10 @@ file(GLOB test_srcs # These tests always build with RDC, so make sure that the sm_XX flags are # compatible. See note in ThrustCudaConfig.cmake. +# TODO once we're using CUDA_ARCHITECTURES, we can setup non-rdc fallback +# tests to build for non-rdc arches. But for now, all files in a given directory +# must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around +# how CUDA_FLAGS works. set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}") foreach(thrust_target IN LISTS THRUST_TARGETS) @@ -18,11 +22,11 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) get_filename_component(test_name "${test_src}" NAME_WLE) string(PREPEND test_name "cuda.") - thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target}) - - # All in testing/cuda will test device-side launch (aka calling parallel - # algorithms from device code), which requires the CUDA device-side runtime, - # which requires RDC, so these always need to be built with RDC. - thrust_enable_rdc_for_cuda_target(${test_target}) + # Create two targets, one with RDC enabled, the other without. This tests + # both device-side behaviors -- the CDP kernel launch with RDC, and the + # serial fallback path without RDC. + thrust_add_test(seq_test_target ${test_name}.cdp_0 "${test_src}" ${thrust_target}) + thrust_add_test(cdp_test_target ${test_name}.cdp_1 "${test_src}" ${thrust_target}) + thrust_enable_rdc_for_cuda_target(${cdp_test_target}) endforeach() endforeach() From cb30a6b4140d579421aa20f2589b849bf841e2b3 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 17 May 2022 18:27:20 -0400 Subject: [PATCH 3/3] Use DebugSyncStream where appropriate. --- thrust/system/cuda/detail/core/agent_launcher.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/system/cuda/detail/core/agent_launcher.h index b604f293e..b9ecbe2e3 100644 --- a/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/system/cuda/detail/core/agent_launcher.h @@ -512,12 +512,7 @@ namespace core { THRUST_RUNTIME_FUNCTION void sync() const { - if (debug_sync) - { - NV_IF_TARGET(NV_IS_HOST, - (cudaStreamSynchronize(stream);), - (cub::detail::device_synchronize();)); - } + CubDebug(cub::detail::DebugSyncStream(stream, this->debug_sync)); } template