diff --git a/cub/detail/detect_cuda_runtime.cuh b/cub/detail/detect_cuda_runtime.cuh new file mode 100644 index 0000000000..7ac947277d --- /dev/null +++ b/cub/detail/detect_cuda_runtime.cuh @@ -0,0 +1,100 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Utilities for CUDA dynamic parallelism. + */ + +#pragma once + +#include + +#include + +CUB_NAMESPACE_BEGIN +namespace detail +{ + +#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: + +/** + * \def CUB_RDC_ENABLED + * + * Defined if RDC is enabled. + */ +#define CUB_RDC_ENABLED + +/** + * \def CUB_RUNTIME_FUNCTION + * + * Execution space for functions that can use the CUDA runtime API (`__host__` + * when RDC is off, `__host__ __device__` when RDC is on). + */ +#define CUB_RUNTIME_FUNCTION + +/** + * \def CUB_RUNTIME_ENABLED + * + * Whether or not the active compiler pass is allowed to invoke device kernels + * or methods from the CUDA runtime API. + * + * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__` + * and is not compatible with `NV_IF_TARGET`. It is provided for legacy + * purposes only. + * + * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`. + */ +#define CUB_RUNTIME_ENABLED + +#else // Non-doxygen pass: + +#ifndef CUB_RUNTIME_FUNCTION + +#if defined(__CUDACC_RDC__) + +#define CUB_RDC_ENABLED +#define CUB_RUNTIME_FUNCTION __host__ __device__ + +#else // RDC disabled: + +#define CUB_RUNTIME_FUNCTION __host__ + +#endif // RDC enabled + +#if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) +// Legacy only -- do not use in new code. +#define CUB_RUNTIME_ENABLED +#endif + +#endif // CUB_RUNTIME_FUNCTION predefined + +#endif // Do not document + +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/detail/device_synchronize.cuh b/cub/detail/device_synchronize.cuh index 52c5a10663..1a868ff637 100644 --- a/cub/detail/device_synchronize.cuh +++ b/cub/detail/device_synchronize.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -38,8 +39,6 @@ CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() { cudaError_t result = cudaErrorUnknown; -#ifdef CUB_RUNTIME_ENABLED - #if defined(__CUDACC__) && \ ((__CUDACC_VER_MAJOR__ > 11) || \ ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) @@ -48,11 +47,6 @@ CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() result = __cudaDeviceSynchronizeDeprecationAvoidance(); #else // CUDA < 11.6 #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize(); -#endif - -#else // Device code without the CUDA runtime. - // Device side CUDA API calls are not supported in this configuration. -#define CUB_TMP_DEVICE_SYNC_IMPL result = cudaErrorInvalidConfiguration; #endif NV_IF_TARGET(NV_IS_HOST, diff --git a/cub/detail/type_traits.cuh b/cub/detail/type_traits.cuh index 9b28e2c959..803dbf74ee 100644 --- a/cub/detail/type_traits.cuh +++ b/cub/detail/type_traits.cuh @@ -32,8 +32,8 @@ #pragma once -#include "../util_cpp_dialect.cuh" -#include "../util_namespace.cuh" +#include +#include #include diff --git a/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/device/dispatch/dispatch_adjacent_difference.cuh index 676ae6bfaf..60f37b2adb 100644 --- a/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -259,12 +259,10 @@ struct DispatchAdjacentDifference : public SelectedPolicy num_tiles, tile_size); - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch @@ -302,12 +300,11 @@ struct DispatchAdjacentDifference : public SelectedPolicy difference_op, num_items); - if (debug_synchronous) + + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh index c78f125a47..1fae8638d0 100644 --- a/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/device/dispatch/dispatch_histogram.cuh @@ -35,12 +35,12 @@ #pragma once #include +#include +#include +#include #include #include #include -#include -#include -#include #include @@ -530,13 +530,6 @@ public: cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { - #ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - - #else - cudaError error = cudaSuccess; do { @@ -674,17 +667,21 @@ public: tile_queue); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); return error; - - #endif // CUB_RUNTIME_ENABLED } diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh index 0b480899c5..86f1f8cd0e 100644 --- a/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/device/dispatch/dispatch_merge_sort.cuh @@ -704,12 +704,10 @@ struct DispatchMergeSort : SelectedPolicy block_sort_launcher.launch(); - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch @@ -769,12 +767,10 @@ struct DispatchMergeSort : SelectedPolicy target_merged_tiles_number, tile_size); - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch @@ -786,12 +782,10 @@ struct DispatchMergeSort : SelectedPolicy // Merge merge_launcher.launch(ping, target_merged_tiles_number); - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // Check for failure to launch diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 37cd74d84c..fa189ba7b8 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1044,11 +1044,6 @@ struct DispatchRadixSort : cudaError_t InvokeSingleTile( SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel { -#ifndef CUB_RUNTIME_ENABLED - (void)single_tile_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else cudaError error = cudaSuccess; do { @@ -1078,10 +1073,17 @@ struct DispatchRadixSort : end_bit); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Update selector d_keys.selector ^= 1; @@ -1090,8 +1092,6 @@ struct DispatchRadixSort : while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } @@ -1141,10 +1141,17 @@ struct DispatchRadixSort : pass_config.even_share); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Log scan_kernel configuration if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", @@ -1158,10 +1165,17 @@ struct DispatchRadixSort : pass_spine_length); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Log downsweep_kernel configuration if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", @@ -1184,10 +1198,17 @@ struct DispatchRadixSort : pass_config.even_share); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Update current bit current_bit += pass_bits; @@ -1352,14 +1373,13 @@ struct DispatchRadixSort : { break; } - if (debug_synchronous) + + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } - + // exclusive sums to determine starts const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; @@ -1377,14 +1397,13 @@ struct DispatchRadixSort : d_bins); if (CubDebug(error)) { - break; + break; } - if (debug_synchronous) + + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } // use the other buffer if no overwrite is allowed @@ -1442,12 +1461,10 @@ struct DispatchRadixSort : break; } - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - break; - } + break; } } @@ -1483,17 +1500,6 @@ struct DispatchRadixSort : DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel { -#ifndef CUB_RUNTIME_ENABLED - (void)upsweep_kernel; - (void)alt_upsweep_kernel; - (void)scan_kernel; - (void)downsweep_kernel; - (void)alt_downsweep_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; do { @@ -1606,8 +1612,6 @@ struct DispatchRadixSort : while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } @@ -1847,10 +1851,17 @@ struct DispatchSegmentedRadixSort : current_bit, pass_bits); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Update current bit current_bit += pass_bits; @@ -1893,14 +1904,6 @@ struct DispatchSegmentedRadixSort : SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel { -#ifndef CUB_RUNTIME_ENABLED - (void)segmented_kernel; - (void)alt_segmented_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; do { @@ -1979,8 +1982,6 @@ struct DispatchSegmentedRadixSort : while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index 0ac3b7b3c0..ff2adf2005 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -387,12 +387,6 @@ struct DispatchReduce : cudaError_t InvokeSingleTile( SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel { -#ifndef CUB_RUNTIME_ENABLED - (void)single_tile_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else cudaError error = cudaSuccess; do { @@ -420,16 +414,21 @@ struct DispatchReduce : init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } @@ -447,14 +446,6 @@ struct DispatchReduce : ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel { -#ifndef CUB_RUNTIME_ENABLED - (void) reduce_kernel; - (void) single_tile_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; do { @@ -517,10 +508,17 @@ struct DispatchReduce : reduction_op); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Log single_reduce_sweep_kernel configuration if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", @@ -539,17 +537,21 @@ struct DispatchReduce : init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED - } @@ -717,11 +719,6 @@ struct DispatchSegmentedReduce : cudaError_t InvokePasses( DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel { -#ifndef CUB_RUNTIME_ENABLED - (void)segmented_reduce_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else cudaError error = cudaSuccess; do { @@ -758,17 +755,21 @@ struct DispatchSegmentedReduce : init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED - } diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index 952ace4240..e292abbc56 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -260,29 +260,6 @@ struct DispatchReduceByKey ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_keys_in; - (void)d_unique_out; - (void)d_values_in; - (void)d_aggregates_out; - (void)d_num_runs_out; - (void)equality_op; - (void)reduction_op; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)init_kernel; - (void)reduce_by_key_kernel; - (void)reduce_by_key_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - cudaError error = cudaSuccess; do { @@ -324,14 +301,23 @@ struct DispatchReduceByKey d_num_runs_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Return if empty problem if (num_items == 0) + { break; + } // Get SM occupancy for reduce_by_key_kernel int reduce_by_key_sm_occupancy; @@ -369,17 +355,22 @@ struct DispatchReduceByKey num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh index 692a55d4e9..35ab8aa0fb 100644 --- a/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/device/dispatch/dispatch_rle.cuh @@ -261,14 +261,6 @@ struct DeviceRleDispatch DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - cudaError error = cudaSuccess; do { @@ -310,14 +302,23 @@ struct DeviceRleDispatch d_num_runs_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Return if empty problem if (num_items == 0) + { break; + } // Get SM occupancy for device_rle_sweep_kernel int device_rle_kernel_sm_occupancy; @@ -354,17 +355,21 @@ struct DeviceRleDispatch num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index 2d452d3323..f59097fcd9 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -276,16 +276,6 @@ struct DispatchScan: CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { -#ifndef CUB_RUNTIME_ENABLED - - (void)init_kernel; - (void)scan_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - typedef typename ActivePolicyT::ScanPolicyT Policy; typedef typename cub::ScanTileState ScanTileStateT; @@ -339,11 +329,17 @@ struct DispatchScan: num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Get SM occupancy for scan_kernel int scan_sm_occupancy; @@ -377,17 +373,22 @@ struct DispatchScan: num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } template diff --git a/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/device/dispatch/dispatch_scan_by_key.cuh index ff22a6208c..3557f19a16 100644 --- a/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -281,15 +281,6 @@ struct DispatchScanByKey: CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { -#ifndef CUB_RUNTIME_ENABLED - - (void)init_kernel; - (void)scan_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else typedef typename ActivePolicyT::ScanByKeyPolicyT Policy; typedef ReduceByKeyScanTileState ScanByKeyTileStateT; @@ -339,10 +330,17 @@ struct DispatchScanByKey: ).doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, tile_size, num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Get SM occupancy for scan_kernel int scan_sm_occupancy; @@ -380,17 +378,22 @@ struct DispatchScanByKey: num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } template diff --git a/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/device/dispatch/dispatch_segmented_sort.cuh index 6fb68f9002..becd887a30 100644 --- a/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -605,12 +605,10 @@ DeviceSegmentedSortContinuation( } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - return error; - } + return error; } } @@ -666,18 +664,17 @@ DeviceSegmentedSortContinuation( } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - return error; - } + return error; } } return error; } +#ifdef CUB_RDC_ENABLED /* * Continuation kernel is used only in the CDP mode. It's used to * launch DeviceSegmentedSortContinuation as a separate kernel. @@ -721,7 +718,6 @@ DeviceSegmentedSortContinuationKernel( // // Due to (4, 5), we can't pass the user-provided stream in the continuation. // Due to (1, 2, 3) it's safe to pass the main stream. - #ifdef CUB_RUNTIME_ENABLED cudaError_t error = DeviceSegmentedSortContinuation( large_kernel, @@ -742,11 +738,8 @@ DeviceSegmentedSortContinuationKernel( debug_synchronous); CubDebug(error); - #else - // Kernel launch not supported from this device - CubDebug(cudaErrorNotSupported); - #endif } +#endif // CUB_RDC_ENABLED template @@ -1576,7 +1569,7 @@ private: THRUST_NS_QUALIFIER::make_reverse_iterator( large_and_medium_segments_indices.get() + num_segments); - if (CubDebug(error = cub::DevicePartition::If( + error = cub::DevicePartition::If( device_partition_temp_storage.get(), three_way_partition_temp_storage_bytes, THRUST_NS_QUALIFIER::counting_iterator(0), @@ -1588,53 +1581,60 @@ private: large_segments_selector, small_segments_selector, stream, - debug_synchronous))) + debug_synchronous); + if (CubDebug(error)) { return error; } -#ifdef CUB_RUNTIME_ENABLED -#define CUB_TMP_DEVICE_CODE \ + // The device path is only used (and only compiles) when CDP is enabled. + // It's defined in a macro since we can't put `#ifdef`s inside of + // `NV_IF_TARGET`. +#ifndef CUB_RDC_ENABLED + +#define CUB_TEMP_DEVICE_CODE + +#else // CUB_RDC_ENABLED + +#define CUB_TEMP_DEVICE_CODE \ using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; \ - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ - .doit(DeviceSegmentedSortContinuationKernel, \ - large_kernel, \ - small_kernel, \ - num_segments, \ - d_keys.Current(), \ - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ - d_keys_double_buffer, \ - d_values.Current(), \ - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ - d_values_double_buffer, \ - d_begin_offsets, \ - d_end_offsets, \ - group_sizes.get(), \ - large_and_medium_segments_indices.get(), \ - small_segments_indices.get(), \ - debug_synchronous); \ + error = \ + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ + .doit(DeviceSegmentedSortContinuationKernel, \ + large_kernel, \ + small_kernel, \ + num_segments, \ + d_keys.Current(), \ + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ + d_keys_double_buffer, \ + d_values.Current(), \ + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ + d_values_double_buffer, \ + d_begin_offsets, \ + d_end_offsets, \ + group_sizes.get(), \ + large_and_medium_segments_indices.get(), \ + small_segments_indices.get(), \ + debug_synchronous); \ \ - if (CubDebug(error = cudaPeekAtLastError())) \ + if (CubDebug(error)) \ { \ return error; \ } \ \ - if (debug_synchronous) \ + error = detail::DebugSyncStream(stream, debug_synchronous); \ + if (CubDebug(error)) \ { \ - if (CubDebug(error = SyncStream(stream))) \ - { \ - return error; \ - } \ + return error; \ } -#else -#define CUB_TMP_DEVICE_CODE error = CubDebug(cudaErrorNotSupported); -#endif + +#endif // CUB_RDC_ENABLED // Clang format mangles some of this NV_IF_TARGET block // clang-format off @@ -1677,10 +1677,10 @@ private: stream, debug_synchronous);), // NV_IS_DEVICE: - (CUB_TMP_DEVICE_CODE)); + (CUB_TEMP_DEVICE_CODE)); // clang-format on -#undef CUB_TMP_DEVICE_CODE +#undef CUB_TEMP_DEVICE_CODE return error; } @@ -1732,12 +1732,10 @@ private: } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = SyncStream(stream))) - { - return error; - } + return error; } return error; diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index fb949e6305..15e3e09a0d 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -254,28 +254,6 @@ struct DispatchSelectIf SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_flags; - (void)d_selected_out; - (void)d_num_selected_out; - (void)select_op; - (void)equality_op; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)scan_init_kernel; - (void)select_if_kernel; - (void)select_if_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - cudaError error = cudaSuccess; do { @@ -317,10 +295,17 @@ struct DispatchSelectIf d_num_selected_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Return if empty problem if (num_items == 0) @@ -374,16 +359,21 @@ struct DispatchSelectIf num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh index 7e2d74b26d..2c88059eec 100644 --- a/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -483,12 +483,6 @@ struct DispatchSpmv KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else cudaError error = cudaSuccess; do { @@ -531,10 +525,17 @@ struct DispatchSpmv spmv_params); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } break; } @@ -639,7 +640,11 @@ struct DispatchSpmv if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } // Log spmv_kernel configuration @@ -661,7 +666,11 @@ struct DispatchSpmv if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } // Run reduce-by-key fixup if necessary if (num_merge_tiles > 1) @@ -685,14 +694,16 @@ struct DispatchSpmv if (CubDebug(error = cudaPeekAtLastError())) break; // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) + { + break; + } } } while (0); return error; - -#endif // CUB_RUNTIME_ENABLED } diff --git a/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/device/dispatch/dispatch_three_way_partition.cuh index cb06438063..f643d83980 100644 --- a/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -360,12 +360,10 @@ struct DispatchThreeWayPartitionIf } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = cub::SyncStream(stream))) - { - break; - } + break; } // Get max x-dimension of grid @@ -430,12 +428,10 @@ struct DispatchThreeWayPartitionIf } // Sync the stream if specified to flush runtime errors - if (debug_synchronous) + error = detail::DebugSyncStream(stream, debug_synchronous); + if (CubDebug(error)) { - if (CubDebug(error = cub::SyncStream(stream))) - { - break; - } + break; } } while (0); diff --git a/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/device/dispatch/dispatch_unique_by_key.cuh index a40e8bef4f..9eb96f0cb9 100644 --- a/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -220,16 +220,6 @@ struct DispatchUniqueByKey: SelectedPolicy CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { -#ifndef CUB_RUNTIME_ENABLED - - (void)init_kernel; - (void)scan_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - using Policy = typename ActivePolicyT::UniqueByKeyPolicyT; using UniqueByKeyAgentT = AgentUniqueByKey diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index 2c4b36bee4..f4013de568 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -33,9 +33,12 @@ #pragma once -#include "util_cpp_dialect.cuh" -#include "util_namespace.cuh" -#include "util_macro.cuh" +#include +#include +#include + +// Legacy include; this functionality used to be defined in here. +#include CUB_NAMESPACE_BEGIN @@ -96,16 +99,6 @@ CUB_NAMESPACE_BEGIN static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); -/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. -#ifndef CUB_RUNTIME_FUNCTION - #if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) - #define CUB_RUNTIME_ENABLED - #define CUB_RUNTIME_FUNCTION __host__ __device__ - #else - #define CUB_RUNTIME_FUNCTION __host__ - #endif -#endif - /// Number of threads per warp #ifndef CUB_LOG_WARP_THREADS diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh index c38b9f8c06..b4a76a4e7d 100644 --- a/cub/util_debug.cuh +++ b/cub/util_debug.cuh @@ -67,14 +67,9 @@ __host__ __device__ __forceinline__ cudaError_t Debug(cudaError_t error, const char *filename, int line) { - (void)filename; - (void)line; - -#ifdef CUB_RUNTIME_ENABLED // Clear the global CUDA error state which may have been set by the last // call. Otherwise, errors may "leak" to unrelated kernel launches. cudaGetLastError(); -#endif #ifdef CUB_STDERR if (error) @@ -103,6 +98,9 @@ cudaError_t Debug(cudaError_t error, const char *filename, int line) ) ); } +#else + (void)filename; + (void)line; #endif return error; diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 9a8ffa1d06..ad145f6b97 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -34,7 +34,6 @@ #pragma once #include - #include #include #include @@ -121,17 +120,9 @@ __global__ void EmptyKernel(void) { } */ CUB_RUNTIME_FUNCTION inline int CurrentDevice() { -#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. - int device = -1; if (CubDebug(cudaGetDevice(&device))) return -1; return device; - -#else // Device code without the CUDA runtime. - - return -1; - -#endif } /** @@ -165,8 +156,6 @@ public: */ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() { -#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. - int count = -1; if (CubDebug(cudaGetDeviceCount(&count))) // CUDA makes no guarantees about the state of the output parameter if @@ -174,12 +163,6 @@ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() // paranoia we'll reset `count` to `-1`. count = -1; return count; - -#else // Device code without the CUDA runtime. - - return -1; - -#endif } /** @@ -471,8 +454,6 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version) */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) { -#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. - cudaError_t error = cudaSuccess; do { @@ -484,16 +465,6 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d while (0); return error; - -#else // Device code without the CUDA runtime. - - (void)sm_version; - (void)device; - - // CUDA API calls are not supported from this device. - return CubDebug(cudaErrorInvalidConfiguration); - -#endif } /** @@ -543,9 +514,47 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) ((void)stream; result = CubDebug(cub::detail::device_synchronize());)); - return result; + return result; } +namespace detail +{ + +/** + * Same as SyncStream, but intended for use with the debug_synchronous flags + * in device algorithms. This should not be used if synchronization is required + * for correctness. + * + * If `debug_synchronous` is false, this function will immediately return + * cudaSuccess. If true, one of the following will occur: + * + * If synchronization is supported by the current compilation target and + * settings, the sync is performed and the sync result is returned. + * + * If syncs are not supported then no sync is performed, but a message is logged + * via _CubLog and cudaSuccess is returned. + */ +CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream, + bool debug_synchronous) +{ + if (!debug_synchronous) + { + return cudaSuccess; + } + +#if 1 // All valid targets currently support device-side synchronization + _CubLog("%s\n", "Synchronizing..."); + return SyncStream(stream); +#else + (void)stream; + _CubLog("%s\n", + "WARNING: Skipping CUB `debug_synchronous` synchronization " + "(unsupported target)."); + return cudaSuccess; +#endif +} + +} // namespace detail /** * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. @@ -586,25 +595,11 @@ cudaError_t MaxSmOccupancy( int block_threads, ///< [in] Number of threads per thread block int dynamic_smem_bytes = 0) ///< [in] Dynamically allocated shared memory in bytes. Default is 0. { -#ifndef CUB_RUNTIME_ENABLED - - (void)dynamic_smem_bytes; - (void)block_threads; - (void)kernel_ptr; - (void)max_sm_occupancy; - - // CUDA API calls not supported from this device - return CubDebug(cudaErrorInvalidConfiguration); - -#else - return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_sm_occupancy, kernel_ptr, block_threads, dynamic_smem_bytes)); - -#endif // CUB_RUNTIME_ENABLED } diff --git a/examples/device/example_device_reduce.cu b/examples/device/example_device_reduce.cu index fc8fddb0e2..c3182579d6 100644 --- a/examples/device/example_device_reduce.cu +++ b/examples/device/example_device_reduce.cu @@ -134,7 +134,7 @@ int main(int argc, char** argv) // Allocate host arrays int* h_in = new int[num_items]; - int h_reference; + int h_reference{}; // Initialize problem and solution Initialize(h_in, num_items); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 902103f199..bea1d90771 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,6 +23,7 @@ function(cub_get_test_params src labels_var defs_var) unused "${match}" ) + set(def ${CMAKE_MATCH_1}) set(label ${CMAKE_MATCH_2}) set(values "${CMAKE_MATCH_3}") @@ -57,7 +58,6 @@ function(cub_get_test_params src labels_var defs_var) endforeach() set(variant_defs "${tmp_defs}") endif() - endforeach() set(${labels_var} "${variant_labels}" PARENT_SCOPE) @@ -115,15 +115,33 @@ function(cub_add_test target_name_var test_name test_src cub_target) endif() add_dependencies(${test_meta_target} ${test_target}) - if (CUB_ENABLE_TESTS_WITH_RDC) - cub_enable_rdc_for_cuda_target(${test_target}) - endif() - add_test(NAME ${test_target} COMMAND "$" ) endfunction() +# Sets out_var to 1 if the label contains cdp variants, regardless of whether +# or not CDP is enabled in this particular variant. +function(_cub_has_cdp_variant out_var label) + string(FIND "${label}" "cdp_" idx) + if (idx EQUAL -1) + set(${out_var} 0 PARENT_SCOPE) + else() + set(${out_var} 1 PARENT_SCOPE) + endif() +endfunction() + +# Sets out_var to 1 if the label contains "cdp_1", e.g. cdp is explicitly +# requested for this variant. +function(_cub_is_cdp_enabled_variant out_var label) + string(FIND "${label}" "cdp_1" idx) + if (idx EQUAL -1) + set(${out_var} 0 PARENT_SCOPE) + else() + set(${out_var} 1 PARENT_SCOPE) + endif() +endfunction() + foreach (test_src IN LISTS test_srcs) get_filename_component(test_name "${test_src}" NAME_WE) string(REGEX REPLACE "^test_" "" test_name "${test_name}") @@ -135,7 +153,7 @@ foreach (test_src IN LISTS test_srcs) math(EXPR range_end "${num_variants} - 1") # Verbose output: - if (NOT num_variants EQUAL 0) + if (num_variants GREATER 0) message(VERBOSE "Detected ${num_variants} variants of test '${test_src}':") foreach(var_idx RANGE ${range_end}) math(EXPR i "${var_idx} + 1") @@ -151,11 +169,16 @@ foreach (test_src IN LISTS test_srcs) if (num_variants EQUAL 0) # Only one version of this test. cub_add_test(test_target ${test_name} "${test_src}" ${cub_target}) - else() + if (CUB_ENABLE_TESTS_WITH_RDC) + cub_enable_rdc_for_cuda_target(${test_target}) + endif() + else() # has variants: # Meta target to build all parametrizations of the current test for the # current CUB_TARGET config set(variant_meta_target ${config_prefix}.test.${test_name}.all) - add_custom_target(${variant_meta_target}) + if (NOT TARGET ${variant_meta_target}) + add_custom_target(${variant_meta_target}) + endif() # Meta target to build all parametrizations of the current test for all # CUB_TARGET configs @@ -171,6 +194,10 @@ foreach (test_src IN LISTS test_srcs) list(GET variant_defs ${var_idx} defs) string(REPLACE ":" ";" defs "${defs}") + # Check if the test has explicit CDP variants: + _cub_has_cdp_variant(explicit_cdp "${label}") + _cub_is_cdp_enabled_variant(enable_cdp "${label}") + cub_add_test(test_target ${test_name}.${label} "${test_src}" @@ -179,9 +206,20 @@ foreach (test_src IN LISTS test_srcs) add_dependencies(${variant_meta_target} ${test_target}) add_dependencies(${cub_variant_meta_target} ${test_target}) target_compile_definitions(${test_target} PRIVATE ${defs}) - endforeach() - endif() - endforeach() -endforeach() + + # Enable RDC if the test either: + # 1. Explicitly requests it (cdp_1 label) + # 2. Does not have an explicit CDP variant (no cdp_0 or cdp_1) but + # RDC testing is globally enabled. + # + # Tests that explicitly request no cdp (cdp_0 label) should never enable + # RDC. + if (enable_cdp OR ((NOT explicit_cdp) AND CUB_ENABLE_TESTS_WITH_RDC)) + cub_enable_rdc_for_cuda_target(${test_target}) + endif() + endforeach() # Variant + endif() # Has variants + endforeach() # CUB targets +endforeach() # Source file add_subdirectory(cmake) diff --git a/test/README.md b/test/README.md index 3f14c38520..81891f495f 100644 --- a/test/README.md +++ b/test/README.md @@ -29,26 +29,38 @@ generate multiple test executables for the full cartesian product of values. - `values` is a colon-separated list of values used during test generation. Only numeric values have been tested. +## Special Labels + +### CDP / RDC Testing + +If a `label` is `cdp`, it is assumed that the parameter is used to explicitly +test variants built with and without CDP support. The `values` for such a +parameter must be `0:1`, with `0` indicating CDP disabled (RDC off) and `1` +indicating CDP enabled (RDC on). + +Tests that do not contain a variant labeled `cdp` will only enable RDC if +the CMake variable `CUB_ENABLE_TESTS_WITH_RDC` is true. + ## Example For example, if `test_baz.cu` contains the following lines: ```cpp // %PARAM% TEST_FOO foo 0:1:2 -// %PARAM% TEST_BAR bar 4:8 +// %PARAM% TEST_CDP cdp 0:1 ``` Six executables and CTest targets will be generated with unique definitions (only c++17 targets shown): -| Executable Name | Preprocessor Definitions | -|----------------------------------|-----------------------------| -| `cub.cpp17.test.baz.foo_0.bar_4` | `-DTEST_FOO=0 -DTEST_BAR=4` | -| `cub.cpp17.test.baz.foo_0.bar_8` | `-DTEST_FOO=0 -DTEST_BAR=8` | -| `cub.cpp17.test.baz.foo_1.bar_4` | `-DTEST_FOO=1 -DTEST_BAR=4` | -| `cub.cpp17.test.baz.foo_1.bar_8` | `-DTEST_FOO=1 -DTEST_BAR=8` | -| `cub.cpp17.test.baz.foo_2.bar_4` | `-DTEST_FOO=2 -DTEST_BAR=4` | -| `cub.cpp17.test.baz.foo_2.bar_8` | `-DTEST_FOO=2 -DTEST_BAR=8` | +| Executable Name | Preprocessor Definitions | RDC State | +|----------------------------------|-----------------------------|-----------| +| `cub.cpp17.test.baz.foo_0.cdp_0` | `-DTEST_FOO=0 -DTEST_CDP=0` | Disabled | +| `cub.cpp17.test.baz.foo_0.cdp_1` | `-DTEST_FOO=0 -DTEST_CDP=1` | Enabled | +| `cub.cpp17.test.baz.foo_1.cdp_0` | `-DTEST_FOO=1 -DTEST_CDP=0` | Disabled | +| `cub.cpp17.test.baz.foo_1.cdp_1` | `-DTEST_FOO=1 -DTEST_CDP=1` | Enabled | +| `cub.cpp17.test.baz.foo_2.cdp_0` | `-DTEST_FOO=2 -DTEST_CDP=0` | Disabled | +| `cub.cpp17.test.baz.foo_2.cdp_1` | `-DTEST_FOO=2 -DTEST_CDP=1` | Enabled | ## Changing `%PARAM%` Hints diff --git a/test/cmake/check_source_files.cmake b/test/cmake/check_source_files.cmake index 1fba8476f6..1554a2256e 100644 --- a/test/cmake/check_source_files.cmake +++ b/test/cmake/check_source_files.cmake @@ -83,6 +83,24 @@ if (NOT valid_count EQUAL 5) "Matched ${valid_count} times, expected 5.") endif() +################################################################################ +# Legacy macro checks. +# Check all files in CUB to make sure that they aren't using the legacy +# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros. +# +# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET. +# They are provided for legacy purposes and should be replaced with +# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code. +# +# +set(legacy_macro_header_exclusions + # This header defines a legacy CUDART macro: + cub/detail/detect_cuda_runtime.cuh +) + +set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED") +set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__") + ################################################################################ # Read source files: foreach(src ${cub_srcs}) @@ -138,6 +156,21 @@ foreach(src ${cub_srcs}) set(found_errors 1) endif() endif() + + if (NOT ${src} IN_LIST legacy_macro_header_exclusions) + count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count) + count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count) + + if (NOT thrust_count EQUAL 0) + message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.") + set(found_errors 1) + endif() + + if (NOT cub_count EQUAL 0) + message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.") + set(found_errors 1) + endif() + endif() endforeach() if (NOT found_errors EQUAL 0) diff --git a/test/test_allocator.cu b/test/test_allocator.cu index eabfa99b46..fcd488b8ef 100644 --- a/test/test_allocator.cu +++ b/test/test_allocator.cu @@ -324,7 +324,6 @@ int main(int argc, char** argv) // Check that that still we have 0 live block across all GPUs AssertEquals(allocator.live_blocks.size(), 0); -#ifndef CUB_CDP // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not if (num_gpus > 1) @@ -361,7 +360,6 @@ int main(int argc, char** argv) // Check that that still we have 0 live block across all GPUs AssertEquals(allocator.live_blocks.size(), 0); } -#endif // CUB_CDP // // Performance diff --git a/test/test_cdp_variant_state.cu b/test/test_cdp_variant_state.cu new file mode 100644 index 0000000000..2a34842786 --- /dev/null +++ b/test/test_cdp_variant_state.cu @@ -0,0 +1,34 @@ +/* +* Copyright 2022 NVIDIA Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include + +#include + +int main() +{ + // This test just checks that RDC is enabled and detected properly when using + // the %PARAM% system to request CDP support (see the README.md file in + // this directory). + + // %PARAM% TEST_CDP cdp 0:1 + +#ifdef CUB_RDC_ENABLED + return (TEST_CDP == 1) ? EXIT_SUCCESS : EXIT_FAILURE; +#else + return (TEST_CDP == 0) ? EXIT_SUCCESS : EXIT_FAILURE; +#endif +} diff --git a/test/test_device_histogram.cu b/test/test_device_histogram.cu index e0a6ae04d5..0faf67cbe3 100644 --- a/test/test_device_histogram.cu +++ b/test/test_device_histogram.cu @@ -433,77 +433,6 @@ struct Dispatch<1, 1, CUB> }; - -//--------------------------------------------------------------------- -// CUDA nested-parallelism test kernel -//--------------------------------------------------------------------- - -/** - * Simple wrapper kernel to invoke DeviceHistogram - * / -template -__global__ void CnpDispatchKernel( - Int2Type algorithm, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - SampleT *d_samples, - SampleIteratorT d_sample_itr, - ArrayWrapper d_out_histograms, - int num_samples, - bool debug_synchronous) -{ -#ifndef CUB_CDP - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch(algorithm, Int2Type(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_samples, 0, debug_synchronous); - *d_temp_storage_bytes = temp_storage_bytes; -#endif -} - - -/ ** - * Dispatch to CDP kernel - * / -template -cudaError_t Dispatch( - Int2Type algorithm, - Int2Type use_cdp, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleT *d_samples, - SampleIteratorT d_sample_itr, - CounterT *d_histograms[NUM_ACTIVE_CHANNELS], - int num_samples, - cudaStream_t stream, - bool debug_synchronous) -{ - // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) - ArrayWrapper d_histo_wrapper; - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; - - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo_wrapper, num_samples, debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; -} -*/ - - //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu index d9a930b3a1..b9e7cb70fb 100644 --- a/test/test_device_radix_sort.cu +++ b/test/test_device_radix_sort.cu @@ -53,10 +53,13 @@ #include #include +#include #include #include #include +#include + #include "test_util.h" using namespace cub; @@ -80,7 +83,11 @@ enum Backend CUB_SEGMENTED, // CUB method (allows overwriting of input) CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input) - CDP, // GPU-based (dynamic parallelism) dispatch to CUB method + // Same as above, but launches kernels from device using CDP. + CDP, + CDP_NO_OVERWRITE, + CDP_SEGMENTED, + CDP_SEGMENTED_NO_OVERWRITE, }; static const char* BackendToString(Backend b) @@ -97,6 +104,12 @@ static const char* BackendToString(Backend b) return "CUB_SEGMENTED_NO_OVERWRITE"; case CDP: return "CDP"; + case CDP_NO_OVERWRITE: + return "CDP_NO_OVERWRITE"; + case CDP_SEGMENTED: + return "CDP_SEGMENTED"; + case CDP_SEGMENTED_NO_OVERWRITE: + return "CDP_SEGMENTED_NO_OVERWRITE"; default: break; } @@ -114,7 +127,6 @@ static const char* BackendToString(Backend b) template CUB_RUNTIME_FUNCTION -__forceinline__ cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, @@ -147,7 +159,6 @@ cudaError_t Dispatch( template CUB_RUNTIME_FUNCTION -__forceinline__ cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, @@ -187,7 +198,6 @@ cudaError_t Dispatch( template CUB_RUNTIME_FUNCTION -__forceinline__ cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, @@ -221,7 +231,6 @@ cudaError_t Dispatch( template CUB_RUNTIME_FUNCTION -__forceinline__ cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, @@ -286,7 +295,6 @@ __host__ __device__ bool ValidateNumItemsForSegmentedSort(NumItemsT num_items) template CUB_RUNTIME_FUNCTION -__forceinline__ cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, @@ -332,7 +340,6 @@ cudaError_t Dispatch( template CUB_RUNTIME_FUNCTION -__forceinline__ cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, @@ -389,7 +396,6 @@ cudaError_t Dispatch( template CUB_RUNTIME_FUNCTION -__forceinline__ cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, @@ -436,7 +442,6 @@ cudaError_t Dispatch( template CUB_RUNTIME_FUNCTION -__forceinline__ cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, @@ -490,104 +495,195 @@ cudaError_t Dispatch( // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceRadixSort */ -template -__global__ void CnpDispatchKernel( - Int2Type is_descending, - int *d_selector, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void *d_temp_storage, - size_t temp_storage_bytes, - DoubleBuffer d_keys, - DoubleBuffer d_values, - NumItemsT num_items, - int num_segments, - BeginOffsetIteratorT d_segment_begin_offsets, - EndOffsetIteratorT d_segment_end_offsets, - int begin_bit, - int end_bit, - bool debug_synchronous) +template +__global__ void CDPDispatchKernel(Int2Type is_descending, + Int2Type cub_backend, + int *d_selector, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + void *d_temp_storage, + size_t temp_storage_bytes, + DoubleBuffer d_keys, + DoubleBuffer d_values, + NumItemsT num_items, + int num_segments, + BeginOffsetIteratorT d_segment_begin_offsets, + EndOffsetIteratorT d_segment_end_offsets, + int begin_bit, + int end_bit, + bool debug_synchronous) { -#ifndef CUB_CDP - (void)is_descending; - (void)d_selector; - (void)d_temp_storage_bytes; - (void)d_cdp_error; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_keys; - (void)d_values; - (void)num_items; - (void)num_segments; - (void)d_segment_begin_offsets; - (void)d_segment_end_offsets; - (void)begin_bit; - (void)end_bit; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch( - is_descending, Int2Type(), d_selector, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_keys, d_values, - num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, - begin_bit, end_bit, 0, debug_synchronous); - *d_temp_storage_bytes = temp_storage_bytes; - *d_selector = d_keys.selector; -#endif + *d_cdp_error = Dispatch(is_descending, + cub_backend, + d_selector, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_segment_begin_offsets, + d_segment_end_offsets, + begin_bit, + end_bit, + 0, + debug_synchronous); + + *d_temp_storage_bytes = temp_storage_bytes; + *d_selector = d_keys.selector; } - /** - * Dispatch to CDP kernel + * Launch kernel and dispatch on device. Should only be called from host code. + * The CubBackend should be one of the non-CDP CUB backends to invoke from the + * device. */ -template -cudaError_t Dispatch( - Int2Type is_descending, - Int2Type dispatch_to, - int *d_selector, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void *d_temp_storage, - size_t &temp_storage_bytes, - DoubleBuffer &d_keys, - DoubleBuffer &d_values, - NumItemsT num_items, - int num_segments, - BeginOffsetIteratorT d_segment_begin_offsets, - EndOffsetIteratorT d_segment_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) +template +cudaError_t LaunchCDPKernel(Int2Type is_descending, + Int2Type cub_backend, + int *d_selector, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + NumItemsT num_items, + int num_segments, + BeginOffsetIteratorT d_segment_begin_offsets, + EndOffsetIteratorT d_segment_end_offsets, + int begin_bit, + int end_bit, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>( - is_descending, d_selector, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_keys, d_values, - num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, - begin_bit, end_bit, debug_synchronous); + // Invoke kernel to invoke device-side dispatch: + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + is_descending, + cub_backend, + d_selector, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_segment_begin_offsets, + d_segment_end_offsets, + begin_bit, + end_bit, + debug_synchronous); + CubDebugExit(retval); + CubDebugExit(cub::detail::device_synchronize()); + + // Copy out selector + CubDebugExit(cudaMemcpy(&d_keys.selector, + d_selector, + sizeof(int) * 1, + cudaMemcpyDeviceToHost)); + d_values.selector = d_keys.selector; + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + + return retval; +} - // Copy out selector - CubDebugExit(cudaMemcpy(&d_keys.selector, d_selector, sizeof(int) * 1, cudaMemcpyDeviceToHost)); - d_values.selector = d_keys.selector; +// Specializations of Dispatch that translate the CDP backend to the appropriate +// CUB backend, and uses the CUB backend to launch the CDP kernel. +#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend) \ + template \ + cudaError_t Dispatch(Int2Type is_descending, \ + Int2Type /*dispatch_to*/, \ + int *d_selector, \ + size_t *d_temp_storage_bytes, \ + cudaError_t *d_cdp_error, \ + \ + void *d_temp_storage, \ + size_t &temp_storage_bytes, \ + DoubleBuffer &d_keys, \ + DoubleBuffer &d_values, \ + NumItemsT num_items, \ + int num_segments, \ + BeginOffsetIteratorT d_segment_begin_offsets, \ + EndOffsetIteratorT d_segment_end_offsets, \ + int begin_bit, \ + int end_bit, \ + cudaStream_t stream, \ + bool debug_synchronous) \ + { \ + Int2Type cub_backend{}; \ + return LaunchCDPKernel(is_descending, \ + cub_backend, \ + d_selector, \ + d_temp_storage_bytes, \ + d_cdp_error, \ + d_temp_storage, \ + temp_storage_bytes, \ + d_keys, \ + d_values, \ + num_items, \ + num_segments, \ + d_segment_begin_offsets, \ + d_segment_end_offsets, \ + begin_bit, \ + end_bit, \ + stream, \ + debug_synchronous); \ + } - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; -} +DEFINE_CDP_DISPATCHER(CDP, CUB) +DEFINE_CDP_DISPATCHER(CDP_NO_OVERWRITE, CUB_NO_OVERWRITE) +DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED) +DEFINE_CDP_DISPATCHER(CDP_SEGMENTED_NO_OVERWRITE, CUB_SEGMENTED_NO_OVERWRITE) +#undef DEFINE_CDP_DISPATCHER +#endif // TEST_CDP //--------------------------------------------------------------------- // Problem generation @@ -721,6 +817,13 @@ void InitializeSolution( NumItemsT *&h_reference_ranks, KeyT *&h_reference_keys) { + if (num_items == 0) + { + h_reference_ranks = nullptr; + h_reference_keys = nullptr; + return; + } + if (pre_sorted) { printf("Shuffling reference solution on CPU\n"); @@ -787,7 +890,9 @@ void InitializeSolution( { h_reference_ranks = new NumItemsT[num_items]; } - NumItemsT max_run = 32, run = 0, i = 0; + NumItemsT max_run = 32; + NumItemsT run = 0; + NumItemsT i = 0; while (summary.size() > 0) { // Pick up a random element and a run. @@ -842,7 +947,12 @@ void InitializeSolution( h_pairs[i].value = i; } - printf("\nSorting reference solution on CPU (%d segments)...", num_segments); fflush(stdout); + printf("\nSorting reference solution on CPU " + "(%zd items, %d segments, %zd items/seg)...", + static_cast(num_items), + num_segments, + static_cast(num_items / num_segments)); + fflush(stdout); for (int i = 0; i < num_segments; ++i) { @@ -945,8 +1055,8 @@ void Test( const bool KEYS_ONLY = std::is_same::value; printf("%s %s cub::DeviceRadixSort %zd items, %d segments, " - "%d-byte keys (%s) %d-byte values (%s), descending %d, " - "begin_bit %d, end_bit %d\n", + "%d-byte keys (%s) %d-byte values (%s), %d-byte num_items (%s), " + "descending %d, begin_bit %d, end_bit %d\n", BackendToString(BACKEND), (KEYS_ONLY) ? "keys-only" : "key-value", static_cast(num_items), @@ -955,6 +1065,8 @@ void Test( typeid(KeyT).name(), (KEYS_ONLY) ? 0 : static_cast(sizeof(ValueT)), typeid(ValueT).name(), + static_cast(sizeof(NumItemsT)), + typeid(NumItemsT).name(), IS_DESCENDING, begin_bit, end_bit); @@ -1104,73 +1216,136 @@ bool HasEnoughMemory(std::size_t num_items, bool overwrite) return test_mem < total_mem; } + /** * Test backend */ -template -void TestBackend( - KeyT *h_keys, - NumItemsT num_items, - int num_segments, - BeginOffsetIteratorT d_segment_begin_offsets, - EndOffsetIteratorT d_segment_end_offsets, - int begin_bit, - int end_bit, - KeyT *h_reference_keys, - NumItemsT *h_reference_ranks) +template +void TestBackend(KeyT *h_keys, + NumItemsT num_items, + int num_segments, + BeginOffsetIteratorT d_segment_begin_offsets, + EndOffsetIteratorT d_segment_end_offsets, + int begin_bit, + int end_bit, + KeyT *h_reference_keys, + NumItemsT *h_reference_ranks) { - const bool KEYS_ONLY = std::is_same::value; +#if TEST_CDP == 0 + constexpr auto NonSegmentedOverwrite = CUB; + constexpr auto NonSegmentedNoOverwrite = CUB_NO_OVERWRITE; + constexpr auto SegmentedOverwrite = CUB_SEGMENTED; + constexpr auto SegmentedNoOverwrite = CUB_SEGMENTED_NO_OVERWRITE; +#else // TEST_CDP + constexpr auto NonSegmentedOverwrite = CDP; + constexpr auto NonSegmentedNoOverwrite = CDP_NO_OVERWRITE; + constexpr auto SegmentedOverwrite = CDP_SEGMENTED; + constexpr auto SegmentedNoOverwrite = CDP_SEGMENTED_NO_OVERWRITE; +#endif // TEST_CDP + + const bool KEYS_ONLY = std::is_same::value; + + // A conservative check assuming overwrite is allowed. + if (!HasEnoughMemory(static_cast(num_items), true)) + { + printf("Skipping the test due to insufficient device memory\n"); + return; + } - // A conservative check assuming overwrite is allowed. - if (!HasEnoughMemory(static_cast(num_items), true)) - { - printf("Skipping the test due to insufficient device memory\n"); - return; - } + std::unique_ptr h_value_data{}; - ValueT *h_values = NULL; - ValueT *h_reference_values = NULL; - - if (!KEYS_ONLY) - { - h_values = new ValueT[num_items]; - h_reference_values = new ValueT[num_items]; + ValueT *h_values = nullptr; + ValueT *h_reference_values = nullptr; - for (NumItemsT i = 0; i < num_items; ++i) - { - InitValue(INTEGER_SEED, h_values[i], i); - InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]); - } - } + if (!KEYS_ONLY) + { + h_value_data.reset(new ValueT[2 * static_cast(num_items)]); + h_values = h_value_data.get(); + h_reference_values = h_value_data.get() + num_items; - // Skip segmented sort if num_items isn't int. - // TODO(canonizer): re-enable these tests once num_items is templated for segmented sort. - if (std::is_same::value) + for (NumItemsT i = 0; i < num_items; ++i) { - Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); - Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); + InitValue(INTEGER_SEED, h_values[i], i); + InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]); } + } + + // Skip segmented sort if num_items isn't int. + // TODO(64bit-seg-sort): re-enable these tests once num_items is templated for + // segmented sort. + if (std::is_same::value) + { + printf("Testing segmented sort with overwrite\n"); + Test(h_keys, + h_values, + num_items, + num_segments, + d_segment_begin_offsets, + d_segment_end_offsets, + begin_bit, + end_bit, + h_reference_keys, + h_reference_values); + printf("Testing segmented sort with no overwrite\n"); + Test(h_keys, + h_values, + num_items, + num_segments, + d_segment_begin_offsets, + d_segment_end_offsets, + begin_bit, + end_bit, + h_reference_keys, + h_reference_values); + } + else + { + printf("Skipping segmented sort tests (NumItemsT != int)\n"); + } - if (num_segments == 1) + if (num_segments == 1) + { + printf("Testing non-segmented sort with overwrite\n"); + Test(h_keys, + h_values, + num_items, + num_segments, + d_segment_begin_offsets, + d_segment_end_offsets, + begin_bit, + end_bit, + h_reference_keys, + h_reference_values); + if (HasEnoughMemory(static_cast(num_items), + false)) { - Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); - if (HasEnoughMemory(static_cast(num_items), false)) - { - Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); - } - else - { - printf("Skipping CUB_NO_OVERWRITE tests due to insufficient memory\n"); - } - #ifdef CUB_CDP - Test( h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); - #endif + printf("Testing non-segmented sort with no overwrite\n"); + Test(h_keys, + h_values, + num_items, + num_segments, + d_segment_begin_offsets, + d_segment_end_offsets, + begin_bit, + end_bit, + h_reference_keys, + h_reference_values); } - - if (h_values) delete[] h_values; - if (h_reference_values) delete[] h_reference_values; + else + { + printf("Skipping no-overwrite tests with %zd items due to " + "insufficient memory\n", + static_cast(num_items)); + } + } } + // Smallest value type for TEST_VALUE_TYPE. // Unless TEST_VALUE_TYPE == 3, this is the only value type tested. #if TEST_VALUE_TYPE == 0 @@ -1612,6 +1787,7 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); + // %PARAM% TEST_CDP cdp 0:1 // %PARAM% TEST_KEY_BYTES bytes 1:2:4:8 // %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3 // 0->Keys only diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index 0599add52c..ef81c23c34 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -42,6 +42,10 @@ #include #include +#include + +#include + #include "test_util.h" #include @@ -70,9 +74,28 @@ enum Backend { CUB, // CUB method CUB_SEGMENTED, // CUB segmented method - CUB_CDP, // GPU-based (dynamic parallelism) dispatch to CUB method + CDP, // GPU-based (dynamic parallelism) dispatch to CUB method + CDP_SEGMENTED, // GPU-based segmented method }; +inline const char* BackendToString(Backend b) +{ + switch (b) + { + case CUB: + return "CUB"; + case CUB_SEGMENTED: + return "CUB_SEGMENTED"; + case CDP: + return "CDP"; + case CDP_SEGMENTED: + return "CDP_SEGMENTED"; + default: + break; + } + + return ""; +} // Custom max functor struct CustomMax @@ -94,7 +117,7 @@ struct CustomMax * Dispatch to reduce entrypoint (custom-max) */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -137,7 +160,7 @@ cudaError_t Dispatch( * Dispatch to sum entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -170,7 +193,7 @@ cudaError_t Dispatch( * Dispatch to min entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -203,7 +226,7 @@ cudaError_t Dispatch( * Dispatch to max entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -236,7 +259,7 @@ cudaError_t Dispatch( * Dispatch to argmin entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -269,7 +292,7 @@ cudaError_t Dispatch( * Dispatch to argmax entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -307,7 +330,7 @@ cudaError_t Dispatch( * Dispatch to reduce entrypoint (custom-max) */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -350,7 +373,7 @@ cudaError_t Dispatch( * Dispatch to sum entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -384,7 +407,7 @@ cudaError_t Dispatch( * Dispatch to min entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -418,7 +441,7 @@ cudaError_t Dispatch( * Dispatch to max entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -452,7 +475,7 @@ cudaError_t Dispatch( * Dispatch to argmin entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -486,7 +509,7 @@ cudaError_t Dispatch( * Dispatch to argmax entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, @@ -521,91 +544,169 @@ cudaError_t Dispatch( // CUDA nested-parallelism test kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceReduce */ -template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT, - typename ReductionOpT> -__global__ void CnpDispatchKernel( - int timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - int max_segments, - BeginOffsetIteratorT d_segment_begin_offsets, - EndOffsetIteratorT d_segment_end_offsets, - ReductionOpT reduction_op, - bool debug_synchronous) +template +__global__ void CDPDispatchKernel(Int2Type cub_backend, + int timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_items, + int max_segments, + BeginOffsetIteratorT d_segment_begin_offsets, + EndOffsetIteratorT d_segment_end_offsets, + ReductionOpT reduction_op, + bool debug_synchronous) { -#ifndef CUB_CDP - (void)timing_iterations; - (void)d_temp_storage_bytes; - (void)d_cdp_error; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_out; - (void)num_items; - (void)max_segments; - (void)d_segment_begin_offsets; - (void)d_segment_end_offsets; - (void)reduction_op; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch(Int2Type(), timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, - d_in, d_out, num_items, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, 0, debug_synchronous); - *d_temp_storage_bytes = temp_storage_bytes; -#endif + *d_cdp_error = Dispatch(cub_backend, + timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + max_segments, + d_segment_begin_offsets, + d_segment_end_offsets, + reduction_op, + 0, + debug_synchronous); + + *d_temp_storage_bytes = temp_storage_bytes; } - /** - * Dispatch to CUB_CDP kernel + * Launch kernel and dispatch on device. Should only be called from host code. + * The CubBackend should be one of the non-CDP CUB backends to invoke from the + * device. */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type dispatch_to, - int timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - int max_segments, - BeginOffsetIteratorT d_segment_begin_offsets, - EndOffsetIteratorT d_segment_end_offsets, - ReductionOpT reduction_op, - cudaStream_t stream, - bool debug_synchronous) +template +cudaError_t LaunchCDPKernel(Int2Type cub_backend, + int timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_items, + int max_segments, + BeginOffsetIteratorT d_segment_begin_offsets, + EndOffsetIteratorT d_segment_end_offsets, + ReductionOpT reduction_op, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>(timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, - d_in, d_out, num_items, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + cub_backend, + timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + max_segments, + d_segment_begin_offsets, + d_segment_end_offsets, + reduction_op, + debug_synchronous); + CubDebugExit(retval); + CubDebugExit(cub::detail::device_synchronize()); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + + return retval; } - +// Specializations of Dispatch that translate the CDP backend to the appropriate +// CUB backend, and uses the CUB backend to launch the CDP kernel. +#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend) \ + template \ + cudaError_t Dispatch(Int2Type, \ + int timing_iterations, \ + size_t *d_temp_storage_bytes, \ + cudaError_t *d_cdp_error, \ + \ + void *d_temp_storage, \ + size_t &temp_storage_bytes, \ + InputIteratorT d_in, \ + OutputIteratorT d_out, \ + int num_items, \ + int max_segments, \ + BeginOffsetIteratorT d_segment_begin_offsets, \ + EndOffsetIteratorT d_segment_end_offsets, \ + ReductionOpT reduction_op, \ + cudaStream_t stream, \ + bool debug_synchronous) \ + { \ + Int2Type cub_backend{}; \ + return LaunchCDPKernel(cub_backend, \ + timing_iterations, \ + d_temp_storage_bytes, \ + d_cdp_error, \ + d_temp_storage, \ + temp_storage_bytes, \ + d_in, \ + d_out, \ + num_items, \ + max_segments, \ + d_segment_begin_offsets, \ + d_segment_end_offsets, \ + reduction_op, \ + stream, \ + debug_synchronous); \ + } + +DEFINE_CDP_DISPATCHER(CDP, CUB) +DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED) + +#undef DEFINE_CDP_DISPATCHER + +#endif // TEST_CDP //--------------------------------------------------------------------- // Problem generation @@ -770,7 +871,7 @@ void Test( // Input data types using InputT = cub::detail::value_t; - // Allocate CUB_CDP device arrays for temp storage size and error + // Allocate CDP device arrays for temp storage size and error size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); @@ -859,8 +960,11 @@ void SolveAndTest( using OutputT = typename SolutionT::OutputT; printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n", - (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB", - typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments); + BackendToString(BACKEND), + typeid(ReductionOpT).name(), + num_items, + typeid(HostInputIteratorT).name(), + num_segments); fflush(stdout); // Allocate and solve solution @@ -986,6 +1090,14 @@ void TestByBackend( OffsetT max_segments, GenMode gen_mode) { +#if TEST_CDP == 0 + constexpr auto NonSegmentedBackend = CUB; + constexpr auto SegmentedBackend = CUB_SEGMENTED; +#else // TEST_CDP + constexpr auto NonSegmentedBackend = CDP; + constexpr auto SegmentedBackend = CDP_SEGMENTED; +#endif // TEST_CDP + // Initialize host data printf("\n\nInitializing %d %s -> %s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); @@ -1008,18 +1120,14 @@ void TestByBackend( InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input); // Page-aligned-input tests - TestByOp(h_in, d_in, num_items, 1, - h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL); // Host-dispatch -#ifdef CUB_CDP - TestByOp(h_in, d_in, num_items, 1, - h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL); // Device-dispatch -#endif + TestByOp(h_in, d_in, num_items, 1, + h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL); // Non-page-aligned-input tests if (num_items > 1) { InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input); - TestByOp(h_in + 1, d_in + 1, num_items - 1, 1, + TestByOp(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL); } @@ -1037,7 +1145,7 @@ void TestByBackend( // Test with segment pointer InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input); CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice)); - TestByOp(h_in, d_in, num_items, num_segments, + TestByOp(h_in, d_in, num_items, num_segments, h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1); // Test with segment iterator @@ -1050,7 +1158,7 @@ void TestByBackend( d_segment_offsets, identity_op); - TestByOp(h_in, d_in, num_items, num_segments, + TestByOp(h_in, d_in, num_items, num_segments, h_segment_offsets_itr, h_segment_offsets_itr + 1, d_segment_offsets_itr, d_segment_offsets_itr + 1); // Test with transform iterators of different types @@ -1064,7 +1172,7 @@ void TestByBackend( TransformInputIterator d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T()); TransformInputIterator d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T()); - TestByOp(h_in, d_in, num_items, num_segments, + TestByOp(h_in, d_in, num_items, num_segments, h_segment_begin_offsets_itr, h_segment_end_offsets_itr, d_segment_begin_offsets_itr, d_segment_end_offsets_itr); } @@ -1104,92 +1212,94 @@ void TestByGenMode( OffsetT *h_segment_offsets = new OffsetT[1 + 1]; InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input); - SolveAndTest(h_in, h_in, num_items, 1, - h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL, Sum()); -#ifdef CUB_CDP - SolveAndTest(h_in, h_in, num_items, 1, +#if TEST_CDP == 0 + constexpr auto Backend = CUB; +#else // TEST_CDP + constexpr auto Backend = CDP; +#endif // TEST_CDP + + SolveAndTest(h_in, h_in, num_items, 1, h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL, Sum()); -#endif if (h_segment_offsets) delete[] h_segment_offsets; } - /// Test different problem sizes -template < - typename InputT, - typename OutputT, - typename OffsetT> -struct TestBySize +template +void TestBySize(OffsetT max_items, OffsetT max_segments, OffsetT tile_size) { - OffsetT max_items; - OffsetT max_segments; - - TestBySize(OffsetT max_items, OffsetT max_segments) : - max_items(max_items), - max_segments(max_segments) - {} - - template - cudaError_t Invoke() - { - // - // Black-box testing on all backends - // - - // Test 0, 1, many - TestByGenMode(0, max_segments); - TestByGenMode(1, max_segments); - TestByGenMode(max_items, max_segments); - - // Test random problem sizes from a log-distribution [8, max_items-ish) - int num_iterations = 8; - double max_exp = log(double(max_items)) / log(double(2.0)); - for (int i = 0; i < num_iterations; ++i) - { - OffsetT num_items = (OffsetT) pow(2.0, RandomValue(max_exp - 3.0) + 3.0); - TestByGenMode(num_items, max_segments); - } - - // - // White-box testing of single-segment problems around specific sizes - // - - // Tile-boundaries: multiple blocks, one tile per block - OffsetT tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD; - TestProblem(tile_size * 4, 1, RANDOM, Sum()); - TestProblem(tile_size * 4 + 1, 1, RANDOM, Sum()); - TestProblem(tile_size * 4 - 1, 1, RANDOM, Sum()); + // Test 0, 1, many + TestByGenMode(0, max_segments); + TestByGenMode(1, max_segments); + TestByGenMode(max_items, max_segments); + + // Test random problem sizes from a log-distribution [8, max_items-ish) + int num_iterations = 8; + double max_exp = log(double(max_items)) / log(double(2.0)); + for (int i = 0; i < num_iterations; ++i) + { + OffsetT num_items = (OffsetT)pow(2.0, RandomValue(max_exp - 3.0) + 3.0); + TestByGenMode(num_items, max_segments); + } + + // + // White-box testing of single-segment problems around specific sizes + // + +#if TEST_CDP == 0 + constexpr auto Backend = CUB; +#else // TEST_CDP + constexpr auto Backend = CDP; +#endif // TEST_CDP + + // Tile-boundaries: multiple blocks, one tile per block + TestProblem(tile_size * 4, 1, RANDOM, Sum()); + TestProblem(tile_size * 4 + 1, 1, RANDOM, Sum()); + TestProblem(tile_size * 4 - 1, 1, RANDOM, Sum()); + + // Tile-boundaries: multiple blocks, multiple tiles per block + OffsetT sm_occupancy = 32; + OffsetT occupancy = tile_size * sm_occupancy * g_sm_count; + TestProblem(occupancy, 1, RANDOM, Sum()); + TestProblem(occupancy + 1, 1, RANDOM, Sum()); + TestProblem(occupancy - 1, 1, RANDOM, Sum()); +} - // Tile-boundaries: multiple blocks, multiple tiles per block - OffsetT sm_occupancy = 32; - OffsetT occupancy = tile_size * sm_occupancy * g_sm_count; - TestProblem(occupancy, 1, RANDOM, Sum()); - TestProblem(occupancy + 1, 1, RANDOM, Sum()); - TestProblem(occupancy - 1, 1, RANDOM, Sum()); - return cudaSuccess; - } +template +struct GetTileSize +{ + OffsetT max_items{}; + OffsetT max_segments{}; + OffsetT tile_size{}; + + GetTileSize(OffsetT max_items, OffsetT max_segments) + : max_items(max_items) + , max_segments(max_segments) + {} + + template + CUB_RUNTIME_FUNCTION cudaError_t Invoke() + { + this->tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * + ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD; + return cudaSuccess; + } }; - /// Test problem type -template < - typename InputT, - typename OutputT, - typename OffsetT> -void TestType( - OffsetT max_items, - OffsetT max_segments) +template +void TestType(OffsetT max_items, OffsetT max_segments) { - typedef typename DeviceReducePolicy::MaxPolicy MaxPolicyT; + // Inspect the tuning policies to determine this arch's tile size: + using MaxPolicyT = + typename DeviceReducePolicy::MaxPolicy; + GetTileSize dispatch(max_items, max_segments); + CubDebugExit(MaxPolicyT::Invoke(g_ptx_version, dispatch)); - TestBySize dispatch(max_items, max_segments); - - MaxPolicyT::Invoke(g_ptx_version, dispatch); + TestBySize(max_items, max_segments, dispatch.tile_size); } - //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- @@ -1222,7 +1332,6 @@ int main(int argc, char** argv) "[--i= " "[--device=] " "[--v] " - "[--cdp]" "\n", argv[0]); exit(0); } @@ -1237,7 +1346,8 @@ int main(int argc, char** argv) // Get SM count g_sm_count = args.deviceProp.multiProcessorCount; - // %PARAM% TEST_TYPES types 0:1:2:3 + // %PARAM% TEST_CDP cdp 0:1 + // %PARAM% TEST_TYPES types 0:1:2:3 #if TEST_TYPES == 0 TestType(max_items, max_segments); diff --git a/test/test_device_reduce_by_key.cu b/test/test_device_reduce_by_key.cu index 168c4ecacc..103084d262 100644 --- a/test/test_device_reduce_by_key.cu +++ b/test/test_device_reduce_by_key.cu @@ -33,17 +33,19 @@ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include - -#include -#include #include #include +#include #include +#include + +#include #include "test_util.h" +#include +#include + using namespace cub; @@ -122,94 +124,132 @@ cudaError_t Dispatch( // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceSelect */ -template < - typename KeyInputIteratorT, - typename KeyOutputIteratorT, - typename ValueInputIteratorT, - typename ValueOutputIteratorT, - typename NumRunsIteratorT, - typename EqualityOpT, - typename ReductionOpT, - typename OffsetT> -__global__ void CnpDispatchKernel( - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void *d_temp_storage, - size_t temp_storage_bytes, - KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - ValueInputIteratorT d_values_in, - ValueOutputIteratorT d_values_out, - NumRunsIteratorT d_num_runs, - EqualityOpT equality_op, - ReductionOpT reduction_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +__global__ void CDPDispatchKernel(Int2Type cub_backend, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t temp_storage_bytes, + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumRunsIteratorT d_num_runs, + EqualityOpT equality_op, + ReductionOpT reduction_op, + OffsetT num_items, + bool debug_synchronous) { -#ifndef CUB_CDP - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch(Int2Type(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous); - - *d_temp_storage_bytes = temp_storage_bytes; -#endif + *d_cdp_error = Dispatch(cub_backend, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_num_runs, + equality_op, + reduction_op, + num_items, + 0, + debug_synchronous); + + *d_temp_storage_bytes = temp_storage_bytes; } - /** * Dispatch to CDP kernel */ -template < - typename KeyInputIteratorT, - typename KeyOutputIteratorT, - typename ValueInputIteratorT, - typename ValueOutputIteratorT, - typename NumRunsIteratorT, - typename EqualityOpT, - typename ReductionOpT, - typename OffsetT> -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type dispatch_to, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void *d_temp_storage, - size_t &temp_storage_bytes, - KeyInputIteratorT d_keys_in, - KeyOutputIteratorT d_keys_out, - ValueInputIteratorT d_values_in, - ValueOutputIteratorT d_values_out, - NumRunsIteratorT d_num_runs, - EqualityOpT equality_op, - ReductionOpT reduction_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +__forceinline__ cudaError_t +Dispatch(Int2Type /*dispatch_to*/, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + KeyOutputIteratorT d_keys_out, + ValueInputIteratorT d_values_in, + ValueOutputIteratorT d_values_out, + NumRunsIteratorT d_num_runs, + EqualityOpT equality_op, + ReductionOpT reduction_op, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + // Invoke kernel to invoke device-side dispatch + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + Int2Type{}, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_num_runs, + equality_op, + reduction_op, + num_items, + debug_synchronous); + CubDebugExit(retval); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + return retval; } - +#endif // TEST_CDP //--------------------------------------------------------------------- // Test generation @@ -595,10 +635,11 @@ void TestDispatch( int num_items, ReductionOpT reduction_op) { +#if TEST_CDP == 0 Test(num_items, reduction_op); -#ifdef CUB_CDP +#elif TEST_CDP == 1 Test(num_items, reduction_op); -#endif +#endif // TEST_CDP } @@ -671,7 +712,6 @@ int main(int argc, char** argv) "[--maxseg=]" "[--entropy=]" "[--v] " - "[--cdp]" "\n", argv[0]); exit(0); } @@ -684,6 +724,8 @@ int main(int argc, char** argv) int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); + // %PARAM% TEST_CDP cdp 0:1 + // Test different input types TestOp(num_items); TestOp(num_items); diff --git a/test/test_device_run_length_encode.cu b/test/test_device_run_length_encode.cu index 95e6331cb5..6b4a0f7ead 100644 --- a/test/test_device_run_length_encode.cu +++ b/test/test_device_run_length_encode.cu @@ -27,23 +27,24 @@ ******************************************************************************/ /****************************************************************************** - * Test of DeviceReduce::RunLengthEncode utilities + * Test of DeviceRunLengthEncode utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include - -#include -#include -#include #include +#include #include +#include + +#include #include "test_util.h" +#include +#include + using namespace cub; @@ -174,94 +175,131 @@ cudaError_t Dispatch( // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceRunLengthEncode */ -template < - int RLE_METHOD, - typename InputIteratorT, - typename UniqueOutputIteratorT, - typename OffsetsOutputIteratorT, - typename LengthsOutputIteratorT, - typename NumRunsIterator, - typename EqualityOp, - typename OffsetT> -__global__ void CnpDispatchKernel( - Int2Type method, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - InputIteratorT d_in, - UniqueOutputIteratorT d_unique_out, - OffsetsOutputIteratorT d_offsets_out, - LengthsOutputIteratorT d_lengths_out, - NumRunsIterator d_num_runs, - cub::Equality equality_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +__global__ void CDPDispatchKernel(Int2Type method, + Int2Type cub_backend, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t temp_storage_bytes, + InputIteratorT d_in, + UniqueOutputIteratorT d_unique_out, + OffsetsOutputIteratorT d_offsets_out, + LengthsOutputIteratorT d_lengths_out, + NumRunsIterator d_num_runs, + cub::Equality equality_op, + OffsetT num_items, + bool debug_synchronous) { - -#ifndef CUB_CDP - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch(method, Int2Type(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous); - - *d_temp_storage_bytes = temp_storage_bytes; -#endif + *d_cdp_error = Dispatch(method, + cub_backend, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + d_offsets_out, + d_lengths_out, + d_num_runs, + equality_op, + num_items, + 0, + debug_synchronous); + + *d_temp_storage_bytes = temp_storage_bytes; } - /** * Dispatch to CDP kernel */ -template < - int RLE_METHOD, - typename InputIteratorT, - typename UniqueOutputIteratorT, - typename OffsetsOutputIteratorT, - typename LengthsOutputIteratorT, - typename NumRunsIterator, - typename EqualityOp, - typename OffsetT> -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type method, - Int2Type dispatch_to, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - UniqueOutputIteratorT d_unique_out, - OffsetsOutputIteratorT d_offsets_out, - LengthsOutputIteratorT d_lengths_out, - NumRunsIterator d_num_runs, - EqualityOp equality_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +__forceinline__ cudaError_t +Dispatch(Int2Type method, + Int2Type /*dispatch_to*/, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + UniqueOutputIteratorT d_unique_out, + OffsetsOutputIteratorT d_offsets_out, + LengthsOutputIteratorT d_lengths_out, + NumRunsIterator d_num_runs, + EqualityOp equality_op, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + // Invoke kernel to invoke device-side dispatch + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + method, + Int2Type{}, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + d_offsets_out, + d_lengths_out, + d_num_runs, + equality_op, + num_items, + debug_synchronous); + CubDebugExit(retval); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + return retval; } - +#endif // TEST_CDP //--------------------------------------------------------------------- // Test generation @@ -533,7 +571,7 @@ void TestPointer( int num_runs = Solve(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items); printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n", - (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other", + (RLE_METHOD == RLE) ? "DeviceRunLengthEncode::Encode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other", (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_runs, float(num_items) / num_runs, typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(), @@ -586,7 +624,7 @@ void TestIterator( int num_runs = Solve(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items); printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n", - (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other", + (RLE_METHOD == RLE) ? "DeviceRunLengthEncode::Encode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other", (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_runs, float(num_items) / num_runs, typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name()); @@ -653,10 +691,10 @@ template < void TestDispatch( int num_items) { +#if TEST_CDP == 0 Test(num_items); Test(num_items); - -#ifdef CUB_CDP +#elif TEST_CDP == 1 Test(num_items); Test(num_items); #endif @@ -722,6 +760,8 @@ int main(int argc, char** argv) CubDebugExit(args.DeviceInit()); printf("\n"); + // %PARAM% TEST_CDP cdp 0:1 + // Test different input types TestSize(num_items); TestSize(num_items); diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu index b9f4e41610..ca2b320151 100644 --- a/test/test_device_scan.cu +++ b/test/test_device_scan.cu @@ -33,17 +33,19 @@ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include - #include #include #include #include #include +#include + #include "test_util.h" +#include +#include + using namespace cub; @@ -384,46 +386,37 @@ Dispatch(Int2Type /*in_place*/, // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceScan */ -template -__global__ void CnpDispatchKernel(Int2Type /*in_place*/, + typename OffsetT> +__global__ void CDPDispatchKernel(InPlaceT in_place, + CubBackendT cub_backend, IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, - void *d_temp_storage, - size_t temp_storage_bytes, - InputIteratorT d_in, + + void *d_temp_storage, + size_t temp_storage_bytes, + InputIteratorT d_in, OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - bool debug_synchronous) + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + bool debug_synchronous) { -#ifndef CUB_CDP - (void)is_primitive; - (void)timing_timing_iterations; - (void)d_temp_storage_bytes; - (void)d_cdp_error; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_out; - (void)scan_op; - (void)initial_value; - (void)num_items; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch(Int2Type(), + *d_cdp_error = Dispatch(in_place, + cub_backend, is_primitive, timing_timing_iterations, d_temp_storage_bytes, @@ -439,48 +432,64 @@ __global__ void CnpDispatchKernel(Int2Type /*in_place*/, debug_synchronous); *d_temp_storage_bytes = temp_storage_bytes; -#endif } /** * Dispatch to CDP kernel */ -template -cudaError_t Dispatch(Int2Type /*in_place*/, + typename OffsetT> +cudaError_t Dispatch(InPlaceT in_place, Int2Type dispatch_to, - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, + IsPrimitiveT is_primitive, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1, 1>>>(is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - initial_value, - num_items, - debug_synchronous); + // Invoke kernel to invoke device-side dispatch to CUB backend: + (void)dispatch_to; + using CubBackendT = Int2Type; + CubBackendT cub_backend; + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + in_place, + cub_backend, + is_primitive, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + debug_synchronous); + CubDebugExit(retval); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, @@ -489,7 +498,6 @@ cudaError_t Dispatch(Int2Type /*in_place*/, cudaMemcpyDeviceToHost)); // Copy out error - cudaError_t retval; CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, @@ -497,6 +505,8 @@ cudaError_t Dispatch(Int2Type /*in_place*/, return retval; } +#endif // TEST_CDP + //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- @@ -991,10 +1001,11 @@ void Test( ScanOpT scan_op, InitialValueT initial_value) { +#if TEST_CDP == 0 Test(num_items, scan_op, initial_value); -#ifdef CUB_CDP +#elif TEST_CDP == 1 Test(num_items, scan_op, initial_value); -#endif +#endif // TEST_CDP } @@ -1082,6 +1093,7 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); + // %PARAM% TEST_CDP cdp 0:1 // %PARAM% TEST_VALUE_TYPES types 0:1:2 #if TEST_VALUE_TYPES == 0 diff --git a/test/test_device_scan_by_key.cu b/test/test_device_scan_by_key.cu index d6b0da67aa..db46a04661 100644 --- a/test/test_device_scan_by_key.cu +++ b/test/test_device_scan_by_key.cu @@ -32,18 +32,20 @@ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include - -#include +#include #include -#include #include +#include #include -#include +#include + +#include #include "test_util.h" +#include +#include + using namespace cub; @@ -217,7 +219,6 @@ cudaError_t Dispatch( return error; } - /** * Dispatch to inclusive sum entrypoint */ @@ -254,114 +255,130 @@ cudaError_t Dispatch( // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceScan */ -template -__global__ void CnpDispatchKernel( - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - EqualityOpT equality_op, - bool debug_synchronous) +template +__global__ void CDPDispatchKernel(Int2Type cub_backend, + IsPrimitiveT is_primitive, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + EqualityOpT equality_op, + bool debug_synchronous) { -#ifndef CUB_CDP - (void)is_primitive; - (void)timing_timing_iterations; - (void)d_temp_storage_bytes; - (void)d_cdp_error; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_keys_in; - (void)d_values_in; - (void)d_values_out; - (void)scan_op; - (void)initial_value; - (void)num_items; - (void)equality_op; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch( - Int2Type(), - is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - initial_value, - num_items, - 0, - debug_synchronous); - - *d_temp_storage_bytes = temp_storage_bytes; -#endif + *d_cdp_error = Dispatch(cub_backend, + is_primitive, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + scan_op, + initial_value, + num_items, + equality_op, + 0, + debug_synchronous); + + *d_temp_storage_bytes = temp_storage_bytes; } - /** * Dispatch to CDP kernel */ -template -cudaError_t Dispatch( - Int2Type dispatch_to, - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - EqualityOpT equality_op, - cudaStream_t stream, - bool debug_synchronous) +template +cudaError_t Dispatch(Int2Type /*dispatch_to*/, + IsPrimitiveT is_primitive, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + EqualityOpT equality_op, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>( - is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - initial_value, - equality_op, - num_items, - debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + // Invoke kernel to invoke device-side dispatch + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + Int2Type{}, + is_primitive, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + scan_op, + initial_value, + num_items, + equality_op, + debug_synchronous); + CubDebugExit(retval); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + return retval; } +#endif // TEST_CDP //--------------------------------------------------------------------- // Test generation @@ -924,10 +941,11 @@ void Test( InitialValueT initial_value, EqualityOpT equality_op) { +#if TEST_CDP == 0 Test(num_items, scan_op, initial_value, equality_op); -#ifdef CUB_CDP +#elif TEST_CDP == 1 Test(num_items, scan_op, initial_value, equality_op); -#endif +#endif // TEST_CDP } @@ -1019,7 +1037,6 @@ int main(int argc, char** argv) "[--i= " "[--device=] " "[--v] " - "[--cdp]" "\n", argv[0]); exit(0); } @@ -1029,6 +1046,7 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); + // %PARAM% TEST_CDP cdp 0:1 // %PARAM% TEST_VALUE_TYPES types 0:1:2:3:4:5 #if TEST_VALUE_TYPES == 0 diff --git a/test/test_device_segmented_sort.cu b/test/test_device_segmented_sort.cu index 32a764f864..d1f58a1f3c 100644 --- a/test/test_device_segmented_sort.cu +++ b/test/test_device_segmented_sort.cu @@ -1475,7 +1475,7 @@ void InputTestRandom(Input &input) } #endif -AssertTrue(keys_ok); + AssertTrue(keys_ok); AssertTrue(values_ok); input.shuffle(); @@ -1659,7 +1659,7 @@ void Test() } -#ifdef CUB_CDP +#if TEST_CDP == 1 template __global__ void LauncherKernel( void *tmp_storage, @@ -1771,7 +1771,8 @@ void TestDeviceSideLaunch() TestDeviceSideLaunch(1 << 2, 1 << 8); TestDeviceSideLaunch(1 << 9, 1 << 19); } -#endif + +#endif // TEST_CDP int main(int argc, char** argv) @@ -1781,10 +1782,9 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); -#ifdef CUB_CDP - TestDeviceSideLaunch(); -#endif + // %PARAM% TEST_CDP cdp 0:1 +#if TEST_CDP == 0 TestZeroSegments(); TestEmptySegments(1 << 2); TestEmptySegments(1 << 22); @@ -1800,5 +1800,9 @@ int main(int argc, char** argv) Test(); Test(); +#elif TEST_CDP == 1 + TestDeviceSideLaunch(); +#endif // TEST_CDP + return 0; } diff --git a/test/test_device_select_if.cu b/test/test_device_select_if.cu index c917bfd45e..980dd4529c 100644 --- a/test/test_device_select_if.cu +++ b/test/test_device_select_if.cu @@ -33,13 +33,10 @@ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include - -#include #include #include #include +#include #include #include @@ -54,6 +51,9 @@ #include "test_util.h" +#include +#include + using namespace cub; @@ -229,88 +229,130 @@ cudaError_t Dispatch( // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceSelect */ -template -__global__ void CnpDispatchKernel( - IsFlaggedTag is_flagged, - IsPartitionTag is_partition, - int timing_timing_iterations, - size_t* d_temp_storage_bytes, - cudaError_t* d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - InputIteratorT d_in, - FlagIteratorT d_flags, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - SelectOpT select_op, - bool debug_synchronous) +template +__global__ void CDPDispatchKernel(Int2Type cub_backend, + IsFlaggedTag is_flagged, + IsPartitionTag is_partition, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t temp_storage_bytes, + InputIteratorT d_in, + FlagIteratorT d_flags, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + SelectOpT select_op, + bool debug_synchronous) { + *d_cdp_error = Dispatch(cub_backend, + is_flagged, + is_partition, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items, + select_op, + 0, + debug_synchronous); -#ifndef CUB_CDP - (void)is_flagged; - (void)is_partition; - (void)timing_timing_iterations; - (void)d_temp_storage_bytes; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_flags; - (void)d_out; - (void)d_num_selected_out; - (void)num_items; - (void)select_op; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch(Int2Type(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous); - *d_temp_storage_bytes = temp_storage_bytes; -#endif + *d_temp_storage_bytes = temp_storage_bytes; } - /** * Dispatch to CDP kernel */ -template -cudaError_t Dispatch( - Int2Type dispatch_to, - IsFlaggedTag is_flagged, - IsPartitionTag is_partition, - int timing_timing_iterations, - size_t* d_temp_storage_bytes, - cudaError_t* d_cdp_error, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIteratorT d_flags, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - SelectOpT select_op, - cudaStream_t stream, - bool debug_synchronous) +template +cudaError_t Dispatch(Int2Type /*dispatch_to*/, + IsFlaggedTag is_flagged, + IsPartitionTag is_partition, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + FlagIteratorT d_flags, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + SelectOpT select_op, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + // Invoke kernel to invoke device-side dispatch + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + Int2Type{}, + is_flagged, + is_partition, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + num_items, + select_op, + debug_synchronous); + CubDebugExit(retval); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + return retval; } - +#endif // TEST_CDP //--------------------------------------------------------------------- // Test generation @@ -635,10 +677,11 @@ template < void TestOp( int num_items) { +#if TEST_CDP == 0 TestMethod(num_items); -#ifdef CUB_CDP +#elif TEST_CDP == 1 TestMethod(num_items); -#endif +#endif // TEST_CDP } @@ -981,8 +1024,8 @@ void TestFlaggedInPlaceWithAliasedFlags() int *d_num_out = thrust::raw_pointer_cast(num_out.data()); int *d_data = thrust::raw_pointer_cast(data.data()); int *d_flags = d_data; // alias - int *d_allocated_flags = thrust::raw_pointer_cast(data.data()); - int *d_reference = thrust::raw_pointer_cast(reference.data()); + int *d_allocated_flags = thrust::raw_pointer_cast(data.data()); + int *d_reference = thrust::raw_pointer_cast(reference.data()); void *d_tmp_storage{}; std::size_t tmp_storage_size{}; @@ -1070,7 +1113,6 @@ int main(int argc, char** argv) "[--device=] " "[--ratio=] " "[--v] " - "[--cdp] " "\n", argv[0]); exit(0); } @@ -1080,6 +1122,8 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); + // %PARAM% TEST_CDP cdp 0:1 + TestFlagsAliasingInPartition(); TestFlaggedInPlace(); diff --git a/test/test_device_select_unique.cu b/test/test_device_select_unique.cu index c7f6679278..64964e1b37 100644 --- a/test/test_device_select_unique.cu +++ b/test/test_device_select_unique.cu @@ -33,19 +33,20 @@ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include - -#include -#include #include +#include +#include +#include #include #include -#include +#include #include "test_util.h" +#include +#include + using namespace cub; @@ -103,77 +104,102 @@ cudaError_t Dispatch( // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceSelect */ -template -__global__ void CnpDispatchKernel( - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - bool debug_synchronous) +template +__global__ void CDPDispatchKernel(Int2Type cub_backend, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + bool debug_synchronous) { - -#ifndef CUB_CDP - (void)timing_timing_iterations; - (void)d_temp_storage_bytes; - (void)d_cdp_error; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_out; - (void)d_num_selected_out; - (void)num_items; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch(Int2Type(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, debug_synchronous); - *d_temp_storage_bytes = temp_storage_bytes; -#endif + *d_cdp_error = Dispatch(cub_backend, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + d_num_selected_out, + num_items, + 0, + debug_synchronous); + + *d_temp_storage_bytes = temp_storage_bytes; } - /** * Dispatch to CDP kernel */ -template -cudaError_t Dispatch( - Int2Type dispatch_to, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +cudaError_t Dispatch(Int2Type /*dispatch_to*/, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + // Invoke kernel to invoke device-side dispatch + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + Int2Type{}, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + d_num_selected_out, + num_items, + debug_synchronous); + CubDebugExit(retval); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + return retval; } - +#endif // TEST_CDP //--------------------------------------------------------------------- // Test generation @@ -450,10 +476,11 @@ template < void TestOp( int num_items) { +#if TEST_CDP == 0 Test(num_items); -#ifdef CUB_CDP +#elif TEST_CDP == 1 Test(num_items); -#endif +#endif // TEST_CDP } @@ -562,7 +589,6 @@ int main(int argc, char** argv) "[--maxseg=]" "[--entropy=]" "[--v] " - "[--cdp]" "\n", argv[0]); exit(0); } @@ -572,6 +598,8 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); + // %PARAM% TEST_CDP cdp 0:1 + // Test different input types Test(num_items); Test(num_items); diff --git a/test/test_device_select_unique_by_key.cu b/test/test_device_select_unique_by_key.cu index d8db2f7935..bcfa662c01 100644 --- a/test/test_device_select_unique_by_key.cu +++ b/test/test_device_select_unique_by_key.cu @@ -32,15 +32,17 @@ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR -#include -#include - -#include -#include #include +#include +#include + +#include #include "test_util.h" +#include +#include + using namespace cub; @@ -101,83 +103,116 @@ cudaError_t Dispatch( // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- +#if TEST_CDP == 1 + /** * Simple wrapper kernel to invoke DeviceSelect */ -template -__global__ void CnpDispatchKernel( - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - bool debug_synchronous) +template +__global__ void CDPDispatchKernel(Int2Type cub_backend, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + bool debug_synchronous) { - -#ifndef CUB_CDP - (void)timing_timing_iterations; - (void)d_temp_storage_bytes; - (void)d_cdp_error; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_keys_in; - (void)d_values_in; - (void)d_keys_out; - (void)d_values_out; - (void)d_num_selected_out; - (void)num_items; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; -#else - *d_cdp_error = Dispatch(Int2Type(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, debug_synchronous); - *d_temp_storage_bytes = temp_storage_bytes; -#endif + *d_cdp_error = Dispatch(cub_backend, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + num_items, + 0, + debug_synchronous); + + *d_temp_storage_bytes = temp_storage_bytes; } - /** * Dispatch to CDP kernel */ -template -cudaError_t Dispatch( - Int2Type dispatch_to, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t &temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +cudaError_t Dispatch(Int2Type /*dispatch_to*/, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void *d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, - d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + // Invoke kernel to invoke device-side dispatch + cudaError_t retval = + thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) + .doit(CDPDispatchKernel, + Int2Type{}, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + num_items, + debug_synchronous); + CubDebugExit(retval); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + return retval; } - +#endif // TEST_CDP //--------------------------------------------------------------------- // Test generation @@ -486,10 +521,11 @@ template < void TestOp( int num_items) { +#if TEST_CDP == 0 Test(num_items); -#ifdef CUB_CDP +#elif TEST_CDP == 1 Test(num_items); -#endif +#endif // TEST_CDP } @@ -550,7 +586,6 @@ int main(int argc, char** argv) "[--maxseg=]" "[--entropy=]" "[--v] " - "[--cdp]" "\n", argv[0]); exit(0); } @@ -560,6 +595,8 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); + // %PARAM% TEST_CDP cdp 0:1 + // Test different input types Test(num_items); Test(num_items); diff --git a/test/test_util.h b/test/test_util.h index d7b5d28f7c..8d00e435a2 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -811,13 +811,13 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< PRIMITIVE = false, \ NULL_TYPE = false, \ }; \ - static T Max() \ + static __host__ __device__ T Max() \ { \ T retval = { \ NumericTraits::Max()}; \ return retval; \ } \ - static T Lowest() \ + static __host__ __device__ T Lowest() \ { \ T retval = { \ NumericTraits::Lowest()}; \ @@ -899,14 +899,14 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< PRIMITIVE = false, \ NULL_TYPE = false, \ }; \ - static T Max() \ + static __host__ __device__ T Max() \ { \ T retval = { \ NumericTraits::Max(), \ NumericTraits::Max()}; \ return retval; \ } \ - static T Lowest() \ + static __host__ __device__ T Lowest() \ { \ T retval = { \ NumericTraits::Lowest(), \ @@ -996,7 +996,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< PRIMITIVE = false, \ NULL_TYPE = false, \ }; \ - static T Max() \ + static __host__ __device__ T Max() \ { \ T retval = { \ NumericTraits::Max(), \ @@ -1004,7 +1004,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< NumericTraits::Max()}; \ return retval; \ } \ - static T Lowest() \ + static __host__ __device__ T Lowest() \ { \ T retval = { \ NumericTraits::Lowest(), \ @@ -1101,7 +1101,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< PRIMITIVE = false, \ NULL_TYPE = false, \ }; \ - static T Max() \ + static __host__ __device__ T Max() \ { \ T retval = { \ NumericTraits::Max(), \ @@ -1110,7 +1110,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair< NumericTraits::Max()}; \ return retval; \ } \ - static T Lowest() \ + static __host__ __device__ T Lowest() \ { \ T retval = { \ NumericTraits::Lowest(), \ @@ -1253,7 +1253,7 @@ struct NumericTraits PRIMITIVE = false, NULL_TYPE = false, }; - static TestFoo Max() + __host__ __device__ static TestFoo Max() { return TestFoo::MakeTestFoo( NumericTraits::Max(), @@ -1262,7 +1262,7 @@ struct NumericTraits NumericTraits::Max()); } - static TestFoo Lowest() + __host__ __device__ static TestFoo Lowest() { return TestFoo::MakeTestFoo( NumericTraits::Lowest(), @@ -1369,14 +1369,14 @@ struct NumericTraits PRIMITIVE = false, NULL_TYPE = false, }; - static TestBar Max() + __host__ __device__ static TestBar Max() { return TestBar( NumericTraits::Max(), NumericTraits::Max()); } - static TestBar Lowest() + __host__ __device__ static TestBar Lowest() { return TestBar( NumericTraits::Lowest(), @@ -1513,6 +1513,11 @@ int CompareDeviceResults( bool verbose = true, bool display_data = false) { + if (num_items == 0) + { + return 0; + } + // Allocate array on host T *h_data = (T*) malloc(num_items * sizeof(T));