From a064b129673e1f68c511540934dd9d70c6b16683 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sat, 4 Jun 2022 12:41:23 +0400 Subject: [PATCH] P2322R6 accumulator types for reduce --- cub/agent/agent_reduce.cuh | 661 +++--- .../block_reduce_warp_reductions.cuh | 2 +- cub/detail/type_traits.cuh | 14 +- cub/device/device_reduce.cuh | 32 +- cub/device/dispatch/dispatch_reduce.cuh | 1838 ++++++++++------- cub/thread/thread_operators.cuh | 434 ++-- cub/thread/thread_reduce.cuh | 27 +- test/test_device_reduce.cu | 190 +- test/test_thread_operators.cu | 259 +++ 9 files changed, 2221 insertions(+), 1236 deletions(-) create mode 100644 test/test_thread_operators.cu diff --git a/cub/agent/agent_reduce.cuh b/cub/agent/agent_reduce.cuh index 73a2ba492a..aa18efa07e 100644 --- a/cub/agent/agent_reduce.cuh +++ b/cub/agent/agent_reduce.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -13,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -27,357 +27,418 @@ ******************************************************************************/ /** - * \file - * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * @file cub::AgentReduce implements a stateful abstraction of CUDA thread + * blocks for participating in device-wide reduction. 
*/ #pragma once #include -#include "../block/block_load.cuh" -#include "../block/block_reduce.cuh" -#include "../grid/grid_mapping.cuh" -#include "../grid/grid_even_share.cuh" -#include "../config.cuh" -#include "../util_type.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" - +#include +#include +#include +#include +#include +#include +#include +#include CUB_NAMESPACE_BEGIN - /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentReduce + * @tparam NOMINAL_BLOCK_THREADS_4B Threads per thread block + * @tparam NOMINAL_ITEMS_PER_THREAD_4B Items per thread (per tile of input) + * @tparam ComputeT Dominant compute type + * @tparam _VECTOR_LOAD_LENGTH Number of items per vectorized load + * @tparam _BLOCK_ALGORITHM Cooperative block-wide reduction algorithm to use + * @tparam _LOAD_MODIFIER Cache load modifier for reading input elements */ -template < - int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block - int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input) - typename ComputeT, ///< Dominant compute type - int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - typename ScalingType = MemBoundScaling > -struct AgentReducePolicy : - ScalingType +template > +struct AgentReducePolicy : ScalingType { - enum - { - VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - }; - - static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements -}; + /// Number of items per vectorized load + static constexpr int VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH; + /// Cooperative block-wide reduction algorithm to use + static constexpr BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; + /// Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** - * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * @brief AgentReduce implements a stateful abstraction of CUDA thread blocks + * for participating in device-wide reduction . + * + * Each thread reduces only the values it loads. If `FIRST_TILE`, this partial + * reduction is stored into `thread_aggregate`. Otherwise it is accumulated + * into `thread_aggregate`. + * + * @tparam AgentReducePolicy + * Parameterized AgentReducePolicy tuning policy type + * + * @tparam InputIteratorT + * Random-access iterator type for input + * + * @tparam OutputIteratorT + * Random-access iterator type for output + * + * @tparam OffsetT + * Signed integer type for global offsets * - * Each thread reduces only the values it loads. If \p FIRST_TILE, this - * partial reduction is stored into \p thread_aggregate. Otherwise it is - * accumulated into \p thread_aggregate. 
+ * @tparam ReductionOp + * Binary reduction operator type having member + * `auto operator()(T &&a, U &&b)` + * + * @tparam AccumT + * The type of intermediate accumulator (according to P2322R6) */ -template < - typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type - typename InputIteratorT, ///< Random-access iterator type for input - typename OutputIteratorT, ///< Random-access iterator type for output - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) +template struct AgentReduce { - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The input value type - using InputT = cub::detail::value_t; - - /// The output value type - using OutputT = cub::detail::non_void_value_t; - - /// Vector type of InputT for data movement - using VectorT = - typename CubVector::Type; - - /// Input iterator wrapper type (for applying cache modifier) - // Wrap the native input pointer with CacheModifiedInputIterator - // or directly use the supplied input iterator type - using WrappedInputIteratorT = cub::detail::conditional_t< - std::is_pointer::value, - CacheModifiedInputIterator, - InputIteratorT>; - - /// Constants + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + using InputT = cub::detail::value_t; + + /// Vector type of InputT for data movement + using VectorT = + typename CubVector::Type; + + /// Input iterator wrapper type (for applying cache modifier) + // Wrap the native input pointer with CacheModifiedInputIterator + // or directly use the supplied input iterator type + using WrappedInputIteratorT = cub::detail::conditional_t< + std::is_pointer::value, + CacheModifiedInputIterator, + InputIteratorT>; + + /// Constants + static constexpr int BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS; + static constexpr int ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD; + static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; + static constexpr int VECTOR_LOAD_LENGTH = + CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH); + + // Can vectorize according to the policy if the input iterator is a native + // pointer to a primitive type + static constexpr bool ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && + (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && + (std::is_pointer::value) && + Traits::PRIMITIVE; + + static constexpr CacheLoadModifier LOAD_MODIFIER = + AgentReducePolicy::LOAD_MODIFIER; + + static constexpr BlockReduceAlgorithm BLOCK_ALGORITHM = + AgentReducePolicy::BLOCK_ALGORITHM; + + /// Parameterized BlockReduce primitive + using BlockReduceT = + BlockReduce; + + /// Shared memory type required by this thread block + struct _TempStorage + { + typename BlockReduceT::TempStorage reduce; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> + {}; + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + InputIteratorT d_in; ///< Input data to reduce + WrappedInputIteratorT d_wrapped_in; ///< Wrapped input 
data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + + //--------------------------------------------------------------------- + // Utility + //--------------------------------------------------------------------- + + // Whether or not the input is aligned with the vector type (specialized for + // types we can vectorize) + template + static __device__ __forceinline__ bool + IsAligned(Iterator d_in, Int2Type /*can_vectorize*/) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for + // types we cannot vectorize) + template + static __device__ __forceinline__ bool + IsAligned(Iterator /*d_in*/, Int2Type /*can_vectorize*/) + { + return false; + } + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + /** + * @brief Constructor + * @param temp_storage Reference to temp_storage + * @param d_in Input data to reduce + * @param reduction_op Binary reduction operator + */ + __device__ __forceinline__ AgentReduce(TempStorage &temp_storage, + InputIteratorT d_in, + ReductionOp reduction_op) + : temp_storage(temp_storage.Alias()) + , d_in(d_in) + , d_wrapped_in(d_in) + , reduction_op(reduction_op) + {} + + //--------------------------------------------------------------------- + // Tile consumption + //--------------------------------------------------------------------- + + /** + * @brief Consume a full tile of input (non-vectorized) + * @param block_offset The offset the tile to consume + * @param valid_items The number of valid items in the tile + * @param is_full_tile Whether or not this is a full tile + * @param can_vectorize Whether or not we can vectorize loads + */ + template + __device__ __forceinline__ void ConsumeTile(AccumT &thread_aggregate, + OffsetT block_offset, + int /*valid_items*/, + Int2Type /*is_full_tile*/, + Int2Type /*can_vectorize*/) + { + AccumT items[ITEMS_PER_THREAD]; + + // Load items in striped fashion + LoadDirectStriped(threadIdx.x, + d_wrapped_in + block_offset, + items); + + // Reduce items within each thread stripe + thread_aggregate = + (IS_FIRST_TILE) + ? 
internal::ThreadReduce(items, reduction_op) + : internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + /** + * Consume a full tile of input (vectorized) + * @param block_offset The offset the tile to consume + * @param valid_items The number of valid items in the tile + * @param is_full_tile Whether or not this is a full tile + * @param can_vectorize Whether or not we can vectorize loads + */ + template + __device__ __forceinline__ void ConsumeTile(AccumT &thread_aggregate, + OffsetT block_offset, + int /*valid_items*/, + Int2Type /*is_full_tile*/, + Int2Type /*can_vectorize*/) + { + // Alias items as an array of VectorT and load it in striped fashion enum { - BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, - VECTOR_LOAD_LENGTH = CUB_MIN(int(ITEMS_PER_THREAD), int(AgentReducePolicy::VECTOR_LOAD_LENGTH)), - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type - ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && - (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && - (std::is_pointer::value) && Traits::PRIMITIVE, - + WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; - static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; - static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; - - /// Parameterized BlockReduce primitive - using BlockReduceT = - BlockReduce; - - /// Shared memory type required by this thread block - struct _TempStorage + // Fabricate a vectorized input iterator + InputT *d_in_unqualified = const_cast(d_in) + block_offset + + (threadIdx.x * VECTOR_LOAD_LENGTH); + CacheModifiedInputIterator + d_vec_in(reinterpret_cast(d_in_unqualified)); + + // Load items as vector items + InputT input_items[ITEMS_PER_THREAD]; + VectorT *vec_items = reinterpret_cast(input_items); +#pragma unroll + for (int i = 0; i < WORDS; ++i) { - typename BlockReduceT::TempStorage reduce; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - InputIteratorT d_in; ///< Input data to reduce - WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce - ReductionOp reduction_op; ///< Binary reduction operator - - - //--------------------------------------------------------------------- - // Utility - //--------------------------------------------------------------------- - - - // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) - template - static __device__ __forceinline__ bool IsAligned( - Iterator d_in, - Int2Type /*can_vectorize*/) - { - return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + vec_items[i] = d_vec_in[BLOCK_THREADS * i]; } - // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) - template - static __device__ __forceinline__ bool IsAligned( - Iterator /*d_in*/, - Int2Type /*can_vectorize*/) + // Convert from input type to output type + AccumT items[ITEMS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { - return false; + items[i] = input_items[i]; } - - //--------------------------------------------------------------------- - // Constructor 
- //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentReduce( - TempStorage& temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data to reduce - ReductionOp reduction_op) ///< Binary reduction operator - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_wrapped_in(d_in), - reduction_op(reduction_op) - {} - - - //--------------------------------------------------------------------- - // Tile consumption - //--------------------------------------------------------------------- - - /** - * Consume a full tile of input (non-vectorized) - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int /*valid_items*/, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + // Reduce items within each thread stripe + thread_aggregate = + (IS_FIRST_TILE) + ? internal::ThreadReduce(items, reduction_op) + : internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + /** + * Consume a partial tile of input + * @param block_offset The offset the tile to consume + * @param valid_items The number of valid items in the tile + * @param is_full_tile Whether or not this is a full tile + * @param can_vectorize Whether or not we can vectorize loads + */ + template + __device__ __forceinline__ void + ConsumeTile(AccumT &thread_aggregate, + OffsetT block_offset, + int valid_items, + Int2Type /*is_full_tile*/, + Int2Type /*can_vectorize*/) + { + // Partial tile + int thread_offset = threadIdx.x; + + // Read first item + if ((IS_FIRST_TILE) && (thread_offset < valid_items)) { - OutputT items[ITEMS_PER_THREAD]; - - // Load items in striped fashion - LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); - - // Reduce items within each thread stripe - thread_aggregate = (IS_FIRST_TILE) ? 
- internal::ThreadReduce(items, reduction_op) : - internal::ThreadReduce(items, reduction_op, thread_aggregate); + thread_aggregate = d_wrapped_in[block_offset + thread_offset]; + thread_offset += BLOCK_THREADS; } - - /** - * Consume a full tile of input (vectorized) - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int /*valid_items*/, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + // Continue reading items (block-striped) + while (thread_offset < valid_items) { - // Alias items as an array of VectorT and load it in striped fashion - enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; - - // Fabricate a vectorized input iterator - InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); - CacheModifiedInputIterator d_vec_in( - reinterpret_cast(d_in_unqualified)); - - // Load items as vector items - InputT input_items[ITEMS_PER_THREAD]; - VectorT *vec_items = reinterpret_cast(input_items); - #pragma unroll - for (int i = 0; i < WORDS; ++i) - vec_items[i] = d_vec_in[BLOCK_THREADS * i]; - - // Convert from input type to output type - OutputT items[ITEMS_PER_THREAD]; - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - items[i] = input_items[i]; - - // Reduce items within each thread stripe - thread_aggregate = (IS_FIRST_TILE) ? - internal::ThreadReduce(items, reduction_op) : - internal::ThreadReduce(items, reduction_op, thread_aggregate); - } - + InputT item(d_wrapped_in[block_offset + thread_offset]); - /** - * Consume a partial tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int valid_items, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads - { - // Partial tile - int thread_offset = threadIdx.x; - - // Read first item - if ((IS_FIRST_TILE) && (thread_offset < valid_items)) - { - thread_aggregate = d_wrapped_in[block_offset + thread_offset]; - thread_offset += BLOCK_THREADS; - } - - // Continue reading items (block-striped) - while (thread_offset < valid_items) - { - OutputT item (d_wrapped_in[block_offset + thread_offset]); - thread_aggregate = reduction_op(thread_aggregate, item); - thread_offset += BLOCK_THREADS; - } + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; } - - - //--------------------------------------------------------------- - // Consume a contiguous segment of tiles - //--------------------------------------------------------------------- - - /** - * \brief Reduce a contiguous segment of input tiles - */ - template - __device__ __forceinline__ OutputT ConsumeRange( - GridEvenShare &even_share, ///< GridEvenShare descriptor - Int2Type can_vectorize) ///< Whether or not we can vectorize loads + } + + //--------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * @brief Reduce a contiguous segment of input tiles + * @param even_share GridEvenShare descriptor + * @param can_vectorize Whether or not we can vectorize loads + */ + template + __device__ 
__forceinline__ AccumT + ConsumeRange(GridEvenShare &even_share, + Int2Type can_vectorize) + { + AccumT thread_aggregate{}; + + if (even_share.block_offset + TILE_ITEMS > even_share.block_end) { - OutputT thread_aggregate; - - if (even_share.block_offset + TILE_ITEMS > even_share.block_end) - { - // First tile isn't full (not all threads have valid items) - int valid_items = even_share.block_end - even_share.block_offset; - ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); - return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); - } - - // At least one full block - ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); - even_share.block_offset += even_share.block_stride; - - // Consume subsequent full tiles of input - while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) - { - ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); - even_share.block_offset += even_share.block_stride; - } - - // Consume a partially-full tile - if (even_share.block_offset < even_share.block_end) - { - int valid_items = even_share.block_end - even_share.block_offset; - ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); - } - - // Compute block-wide reduction (all threads have valid items) - return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + // First tile isn't full (not all threads have valid items) + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, + even_share.block_offset, + valid_items, + Int2Type(), + can_vectorize); + return BlockReduceT(temp_storage.reduce) + .Reduce(thread_aggregate, reduction_op, valid_items); } + // At least one full block + ConsumeTile(thread_aggregate, + even_share.block_offset, + TILE_ITEMS, + Int2Type(), + can_vectorize); + even_share.block_offset += even_share.block_stride; - /** - * \brief Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ OutputT ConsumeRange( - OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + // Consume subsequent full tiles of input + while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) { - GridEvenShare even_share; - even_share.template BlockInit(block_offset, block_end); - - return (IsAligned(d_in + block_offset, Int2Type())) ? - ConsumeRange(even_share, Int2Type()) : - ConsumeRange(even_share, Int2Type()); + ConsumeTile(thread_aggregate, + even_share.block_offset, + TILE_ITEMS, + Int2Type(), + can_vectorize); + even_share.block_offset += even_share.block_stride; } - - /** - * Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ OutputT ConsumeTiles( - GridEvenShare &even_share) ///< [in] GridEvenShare descriptor + // Consume a partially-full tile + if (even_share.block_offset < even_share.block_end) { - // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block - even_share.template BlockInit(); - - return (IsAligned(d_in, Int2Type())) ? 
- ConsumeRange(even_share, Int2Type()) : - ConsumeRange(even_share, Int2Type()); - + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, + even_share.block_offset, + valid_items, + Int2Type(), + can_vectorize); } + // Compute block-wide reduction (all threads have valid items) + return BlockReduceT(temp_storage.reduce) + .Reduce(thread_aggregate, reduction_op); + } + + /** + * @brief Reduce a contiguous segment of input tiles + * @param[in] block_offset Threadblock begin offset (inclusive) + * @param[in] block_end Threadblock end offset (exclusive) + */ + __device__ __forceinline__ AccumT ConsumeRange(OffsetT block_offset, + OffsetT block_end) + { + GridEvenShare even_share; + even_share.template BlockInit(block_offset, block_end); + + return (IsAligned(d_in + block_offset, Int2Type())) + ? ConsumeRange(even_share, + Int2Type < true && ATTEMPT_VECTORIZATION > ()) + : ConsumeRange(even_share, + Int2Type < false && ATTEMPT_VECTORIZATION > ()); + } + + /** + * Reduce a contiguous segment of input tiles + * @param[in] even_share GridEvenShare descriptor + */ + __device__ __forceinline__ AccumT + ConsumeTiles(GridEvenShare &even_share) + { + // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread + // block + even_share.template BlockInit(); + + return (IsAligned(d_in, Int2Type())) + ? ConsumeRange(even_share, + Int2Type < true && ATTEMPT_VECTORIZATION > ()) + : ConsumeRange(even_share, + Int2Type < false && ATTEMPT_VECTORIZATION > ()); + } }; - CUB_NAMESPACE_END diff --git a/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/block/specializations/block_reduce_warp_reductions.cuh index c341c8ba55..4fec6cad1b 100644 --- a/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -143,7 +143,7 @@ struct BlockReduceWarpReductions // Share lane aggregates if (lane_id == 0) { - temp_storage.warp_aggregates[warp_id] = warp_aggregate; + new (temp_storage.warp_aggregates + warp_id) T(warp_aggregate); } CTA_SYNC(); diff --git a/cub/detail/type_traits.cuh b/cub/detail/type_traits.cuh index 9b28e2c959..c68e4248a7 100644 --- a/cub/detail/type_traits.cuh +++ b/cub/detail/type_traits.cuh @@ -32,10 +32,10 @@ #pragma once -#include "../util_cpp_dialect.cuh" -#include "../util_namespace.cuh" +#include +#include -#include +#include CUB_NAMESPACE_BEGIN @@ -44,11 +44,15 @@ namespace detail { template using invoke_result_t = #if CUB_CPP_DIALECT < 2017 - typename std::result_of::type; + typename cuda::std::result_of::type; #else // 2017+ - std::invoke_result_t; + cuda::std::invoke_result_t; #endif +/// The type of intermediate accumulator (according to P2322R6) +template +using accumulator_t = + typename cuda::std::decay>::type; } // namespace detail CUB_NAMESPACE_END diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 9f70a111a4..4b10556de3 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -155,7 +155,7 @@ struct DeviceReduce // Signed integer type for global offsets typedef int OffsetT; - return DispatchReduce::Dispatch( + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, @@ -239,14 +239,16 @@ struct DeviceReduce cub::detail::non_void_value_t>; - return DispatchReduce::Dispatch( + using InitT = OutputT; + + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum(), - OutputT(), // zero-initialize + InitT{}, // zero-initialize stream, 
debug_synchronous); } @@ -314,14 +316,16 @@ struct DeviceReduce // The input value type using InputT = cub::detail::value_t; - return DispatchReduce::Dispatch( + using InitT = InputT; + + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent stream, debug_synchronous); } @@ -396,6 +400,8 @@ struct DeviceReduce cub::detail::non_void_value_t>; + using InitT = OutputTupleT; + // The output value type using OutputValueT = typename OutputTupleT::Value; @@ -406,9 +412,9 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + InitT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - return DispatchReduce::Dispatch( + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, @@ -483,14 +489,16 @@ struct DeviceReduce // The input value type using InputT = cub::detail::value_t; - return DispatchReduce::Dispatch( + using InitT = InputT; + + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent stream, debug_synchronous); } @@ -568,6 +576,8 @@ struct DeviceReduce // The output value type using OutputValueT = typename OutputTupleT::Value; + using InitT = OutputTupleT; + // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; @@ -575,9 +585,9 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + InitT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - return DispatchReduce::Dispatch( + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index 0ac3b7b3c0..9b0148665b 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -1,7 +1,6 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -14,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -28,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. + * @file cub::DeviceReduce provides device-wide, parallel operations for + * computing a reduction across a sequence of data items residing within + * device-accessible memory. */ #pragma once @@ -37,14 +37,13 @@ #include #include -#include "../../agent/agent_reduce.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../config.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" +#include +#include +#include +#include +#include +#include +#include #include @@ -55,788 +54,1247 @@ CUB_NAMESPACE_BEGIN *****************************************************************************/ /** - * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. + * @brief Reduce region kernel entry point (multi-block). Computes privatized + * reductions, one per thread block. 
+ * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @tparam AccumT + * Accumulator type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] even_share + * Even-share descriptor for mapping an equal number of tiles onto each + * thread block + * + * @param[in] reduction_op + * Binary reduction functor */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) -__global__ void DeviceReduceKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input data items - GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block - ReductionOpT reduction_op) ///< [in] Binary reduction functor +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceReduceKernel(InputIteratorT d_in, + AccumT* d_out, + OffsetT num_items, + GridEvenShare even_share, + ReductionOpT reduction_op) { - // The output value type - using OutputT = - cub::detail::non_void_value_t>; - - // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); - - // Output result - if (threadIdx.x == 0) - d_out[blockIdx.x] = block_aggregate; + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + AccumT block_aggregate = + AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + { + new (d_out + blockIdx.x) AccumT(block_aggregate); + } } - /** - * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. + * @brief Reduce a single tile kernel entry point (single-block). Can be used + * to aggregate privatized thread block reductions from a previous + * multi-block reduction pass. 
+ * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OutputIteratorT + * Output iterator type for recording the reduced aggregate \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `T operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @tparam AccumT + * Accumulator type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -__global__ void DeviceReduceSingleTileKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input data items - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init) ///< [in] The initial value of the reduction +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init) { - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Check if empty problem - if (num_items == 0) + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) { - if (threadIdx.x == 0) - *d_out = init; - return; + *d_out = init; } - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - OffsetT(0), - num_items); + return; + } - // Output result - if (threadIdx.x == 0) - *d_out = reduction_op(init, block_aggregate); -} + // Consume input tiles + AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op) + .ConsumeRange(OffsetT(0), num_items); + // Output result + if (threadIdx.x == 0) + { + *d_out = reduction_op(init, block_aggregate); + } +} /// Normalize input iterator to segment offset template -__device__ __forceinline__ -void NormalizeReductionOutput( - T &/*val*/, - OffsetT /*base_offset*/, - IteratorT /*itr*/) +__device__ __forceinline__ void NormalizeReductionOutput(T 
& /*val*/, + OffsetT /*base_offset*/, + IteratorT /*itr*/) {} - /// Normalize input iterator to segment offset (specialized for arg-index) -template -__device__ __forceinline__ -void NormalizeReductionOutput( - KeyValuePairT &val, - OffsetT base_offset, - ArgIndexInputIterator /*itr*/) +template +__device__ __forceinline__ void NormalizeReductionOutput( + KeyValuePairT &val, + OffsetT base_offset, + ArgIndexInputIterator /*itr*/) { - val.key -= base_offset; + val.key -= base_offset; } - /** * Segmented reduction (one block per segment) + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OutputIteratorT + * Output iterator type for recording the reduced aggregate \iterator + * + * @tparam BeginOffsetIteratorT + * Random-access input iterator type for reading segment beginning offsets + * \iterator + * + * @tparam EndOffsetIteratorT + * Random-access input iterator type for reading segment ending offsets + * \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `T operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first element + * of the *i*th data segment in `d_keys_*` and `d_values_*` + * + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * considered empty. 
+ * + * @param[in] num_segments + * The number of segments that comprise the sorting data + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator - typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceSegmentedReduceKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init) ///< [in] The initial value of the reduction + InputIteratorT d_in, + OutputIteratorT d_out, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int /*num_segments*/, + ReductionOpT reduction_op, + InitT init) { - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::ReducePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - OffsetT segment_begin = d_begin_offsets[blockIdx.x]; - OffsetT segment_end = d_end_offsets[blockIdx.x]; - - // Check if empty problem - if (segment_begin == segment_end) + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + + // Check if empty problem + if (segment_begin == segment_end) + { + if (threadIdx.x == 0) { - if (threadIdx.x == 0) - d_out[blockIdx.x] = init; - return; + d_out[blockIdx.x] = init; } + return; + } - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - segment_begin, - segment_end); + // Consume input tiles + AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op) + .ConsumeRange(segment_begin, segment_end); - // Normalize as needed - NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); - if (threadIdx.x == 0) - d_out[blockIdx.x] = reduction_op(init, block_aggregate);; + if (threadIdx.x == 0) + { + d_out[blockIdx.x] = reduction_op(init, block_aggregate); + } } - - - /****************************************************************************** * Policy ******************************************************************************/ +/** + * @tparam AccumT + * Accumulator data type + * + * OffsetT + * Signed integer type for global offsets + * + * ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + */ template < - typename InputT, ///< Input data type - typename OutputT, ///< Compute/output data type - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename AccumT, + typename OffsetT, + typename ReductionOpT> struct DeviceReducePolicy { - //------------------------------------------------------------------------------ - // Architecture-specific tuning policies - //------------------------------------------------------------------------------ - - /// SM30 - struct Policy300 : ChainedPolicy<300, Policy300, Policy300> - { - // ReducePolicy (GTX670: 154.0 @ 48M 4B items) - typedef AgentReducePolicy< - 256, 20, InputT, ///< Threads per block, items per thread, compute type, compute type - 2, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy 
SegmentedReducePolicy; - }; - - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> - { - // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) - typedef AgentReducePolicy< - 256, 20, InputT, ///< Threads per block, items per thread, compute type - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> - { - // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) - typedef AgentReducePolicy< - 256, 16, InputT, ///< Threads per block, items per thread, compute type - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// MaxPolicy - typedef Policy600 MaxPolicy; - + //--------------------------------------------------------------------------- + // Architecture-specific tuning policies + //--------------------------------------------------------------------------- + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy300> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 20; + static constexpr int items_per_vec_load = 2; + + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + using ReducePolicy = AgentReducePolicy; + + // SingleTilePolicy + using SingleTilePolicy = ReducePolicy; + + // SegmentedReducePolicy + using SegmentedReducePolicy = ReducePolicy; + }; + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 20; + static constexpr int items_per_vec_load = 4; + + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B + // items) + using ReducePolicy = AgentReducePolicy; + + // SingleTilePolicy + using SingleTilePolicy = ReducePolicy; + + // SegmentedReducePolicy + using SegmentedReducePolicy = ReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 16; + static constexpr int items_per_vec_load = 4; + + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + using ReducePolicy = AgentReducePolicy; + + // SingleTilePolicy + using SingleTilePolicy = ReducePolicy; + + // SegmentedReducePolicy + using SegmentedReducePolicy = ReducePolicy; + }; + + using MaxPolicy = Policy600; }; /****************************************************************************** * Single-problem dispatch - ******************************************************************************/ + *****************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + * @brief Utility class for dispatching the appropriately-tuned kernels for + * device-wide reduction + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OutputIteratorT 
+ * Output iterator type for recording the reduced aggregate \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type */ template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT = ///< Data type of the output iterator - cub::detail::non_void_value_t< - OutputIteratorT, - cub::detail::value_t>, - typename SelectedPolicy = DeviceReducePolicy< - cub::detail::value_t, - OutputT, - OffsetT, - ReductionOpT> > -struct DispatchReduce : - SelectedPolicy + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetT, + typename ReductionOpT, + typename InitT = + cub::detail::non_void_value_t< + OutputIteratorT, + cub::detail::value_t>, + typename AccumT = + detail::accumulator_t< + ReductionOpT, + InitT, + cub::detail::value_t>, + typename SelectedPolicy = DeviceReducePolicy> +struct DispatchReduce : SelectedPolicy { - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out; ///< [out] Pointer to the output aggregate - OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int ptx_version; ///< [in] PTX version - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchReduce( - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - OutputT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_in(d_in), - d_out(d_out), - num_items(num_items), - reduction_op(reduction_op), - init(init), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - //------------------------------------------------------------------------------ - // Small-problem (single tile) invocation - //------------------------------------------------------------------------------ - - /// Invoke a single block block to reduce in-core - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokeSingleTile( - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel - { + //--------------------------------------------------------------------------- + // Problem state + //--------------------------------------------------------------------------- + + /// Device-accessible allocation of temporary storage. When `nullptr`, the + /// required allocation size is written to `temp_storage_bytes` and no work + /// is done. + void *d_temp_storage; + + /// Reference to size in bytes of `d_temp_storage` allocation + size_t &temp_storage_bytes; + + /// Pointer to the input sequence of data items + InputIteratorT d_in; + + /// Pointer to the output aggregate + OutputIteratorT d_out; + + /// Total number of input items (i.e., length of `d_in`) + OffsetT num_items; + + /// Binary reduction functor + ReductionOpT reduction_op; + + /// The initial value of the reduction + InitT init; + + /// CUDA stream to launch kernels within. Default is stream0. + cudaStream_t stream; + + /// Whether or not to synchronize the stream after every kernel launch to + /// check for errors. Also causes launch configurations to be printed to the + /// console. Default is `false`. 
+ bool debug_synchronous; + + int ptx_version; + + //--------------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------------- + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ DispatchReduce(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_in(d_in) + , d_out(d_out) + , num_items(num_items) + , reduction_op(reduction_op) + , init(init) + , stream(stream) + , debug_synchronous(debug_synchronous) + , ptx_version(ptx_version) + {} + + //--------------------------------------------------------------------------- + // Small-problem (single tile) invocation + //--------------------------------------------------------------------------- + + /** + * @brief Invoke a single block block to reduce in-core + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam SingleTileKernelT + * Function type of cub::DeviceReduceSingleTileKernel + * + * @param[in] single_tile_kernel + * Kernel function pointer to parameterization of + * cub::DeviceReduceSingleTileKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokeSingleTile(SingleTileKernelT single_tile_kernel) + { #ifndef CUB_RUNTIME_ENABLED - (void)single_tile_kernel; + (void)single_tile_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); #else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - break; - } - - // Log single_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage + // allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " + "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, - (long long) stream, + (long long)stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + } - // Invoke single_reduce_sweep_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream - ).doit(single_tile_kernel, - d_in, - d_out, - num_items, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); + // Invoke single_reduce_sweep_kernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream) + .doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init); - return error; + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } -#endif // CUB_RUNTIME_ENABLED - } + // Sync 
the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) + { + break; + } + } while (0); + return error; - //------------------------------------------------------------------------------ - // Normal problem size invocation (two-pass) - //------------------------------------------------------------------------------ - - /// Invoke two-passes to reduce - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel - typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel - { +#endif // CUB_RUNTIME_ENABLED + } + + //--------------------------------------------------------------------------- + // Normal problem size invocation (two-pass) + //--------------------------------------------------------------------------- + + /** + * @brief Invoke two-passes to reduce + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam ReduceKernelT + * Function type of cub::DeviceReduceKernel + * + * @tparam SingleTileKernelT + * Function type of cub::DeviceReduceSingleTileKernel + * + * @param[in] reduce_kernel + * Kernel function pointer to parameterization of cub::DeviceReduceKernel + * + * @param[in] single_tile_kernel + * Kernel function pointer to parameterization of + * cub::DeviceReduceSingleTileKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokePasses(ReduceKernelT reduce_kernel, + SingleTileKernelT single_tile_kernel) + { #ifndef CUB_RUNTIME_ENABLED - (void) reduce_kernel; - (void) single_tile_kernel; + (void)reduce_kernel; + (void)single_tile_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); #else - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Init regular kernel configuration - KernelConfig reduce_config; - if (CubDebug(error = reduce_config.Init(reduce_kernel))) break; - int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; - - // Even-share work distribution - int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); - GridEvenShare even_share; - even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); - - // Temporary storage allocation requirements - void* allocations[1] = {}; - size_t allocation_sizes[1] = - { - max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions - }; - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - return cudaSuccess; - } - - // Alias the allocation for the privatized 
per-block reductions - OutputT *d_block_reductions = (OutputT*) allocations[0]; - - // Get grid size for device_reduce_sweep_kernel - int reduce_grid_size = even_share.grid_size; - - // Log device_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) + break; + + // Get SM count + int sm_count; + if (CubDebug( + error = cudaDeviceGetAttribute(&sm_count, + cudaDevAttrMultiProcessorCount, + device_ordinal))) + { + break; + } + + // Init regular kernel configuration + KernelConfig reduce_config; + if (CubDebug( + error = reduce_config.Init( + reduce_kernel))) + { + break; + } + + int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); + GridEvenShare even_share; + even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); + + // Temporary storage allocation requirements + void *allocations[1] = {}; + size_t allocation_sizes[1] = { + max_blocks * sizeof(AccumT) // bytes needed for privatized block + // reductions + }; + + // Alias the temporary allocations from the single storage blob (or + // compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, + temp_storage_bytes, + allocations, + allocation_sizes))) + { + break; + } + + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage + // allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + AccumT *d_block_reductions = (AccumT *)allocations[0]; + + // Get grid size for device_reduce_sweep_kernel + int reduce_grid_size = even_share.grid_size; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items " + "per thread, %d SM occupancy\n", reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, - (long long) stream, + (long long)stream, ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, reduce_config.sm_occupancy); + } + + // Invoke DeviceReduceKernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + reduce_grid_size, + ActivePolicyT::ReducePolicy::BLOCK_THREADS, + 0, + stream) + .doit(reduce_kernel, + d_in, + d_block_reductions, + num_items, + even_share, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; + } + } - // Invoke DeviceReduceKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, - 0, stream - ).doit(reduce_kernel, - d_in, - d_block_reductions, - num_items, - even_share, - reduction_op); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Log single_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + // Log single_reduce_sweep_kernel configuration + if 
(debug_synchronous) + { + _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " + "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, - (long long) stream, + (long long)stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); - - // Invoke DeviceReduceSingleTileKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream - ).doit(single_tile_kernel, - d_block_reductions, - d_out, - reduce_grid_size, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Invoke DeviceReduceSingleTileKernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + 1, + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + 0, + stream) + .doit(single_tile_kernel, + d_block_reductions, + d_out, + reduce_grid_size, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; } - while (0); + } + } while (0); - return error; + return error; #endif // CUB_RUNTIME_ENABLED - - } - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() + } + + //--------------------------------------------------------------------------- + // Chained policy invocation + //--------------------------------------------------------------------------- + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() + { + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * + SingleTilePolicyT::ITEMS_PER_THREAD)) { - typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; - typedef typename DispatchReduce::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) - { - // Small, single tile size - return InvokeSingleTile( - DeviceReduceSingleTileKernel); - } - else - { - // Regular size - return InvokePasses( - DeviceReduceKernel, - DeviceReduceSingleTileKernel); - } + // Small, single tile size + return InvokeSingleTile( + DeviceReduceSingleTileKernel); } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init, ///< [in] The initial value of the reduction - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + else { - typedef typename DispatchReduce::MaxPolicy MaxPolicyT; - - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchReduce dispatch( - d_temp_storage, temp_storage_bytes, - d_in, d_out, num_items, reduction_op, init, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - } - while (0); - - return error; + // Regular size + return InvokePasses( + DeviceReduceKernel, + DeviceReduceSingleTileKernel); } + } + + //--------------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------------- + + /** + * @brief Internal dispatch routine for computing a device-wide reduction + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input items (i.e., length of `d_in`) + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. Also causes launch configurations to + * be printed to the console. Default is `false`. 
+ */ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous) + { + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version = 0; + if (CubDebug(error = PtxVersion(ptx_version))) + { + break; + } + + // Create dispatch functor + DispatchReduce dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + init, + stream, + debug_synchronous, + ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + { + break; + } + } while (0); + + return error; + } }; /****************************************************************************** * Segmented dispatch - ******************************************************************************/ + *****************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + * @brief Utility class for dispatching the appropriately-tuned kernels for + * device-wide reduction + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OutputIteratorT + * Output iterator type for recording the reduced aggregate \iterator + * + * @tparam BeginOffsetIteratorT + * Random-access input iterator type for reading segment beginning offsets + * \iterator + * + * @tparam EndOffsetIteratorT + * Random-access input iterator type for reading segment ending offsets + * \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + * + * @tparam InitT + * value type */ template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator - typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT = ///< Data type of the output iterator - cub::detail::non_void_value_t>, - typename SelectedPolicy = DeviceReducePolicy< - cub::detail::value_t, - OutputT, - OffsetT, - ReductionOpT> > -struct DispatchSegmentedReduce : - SelectedPolicy + typename InputIteratorT, + typename OutputIteratorT, + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT, + typename OffsetT, + typename ReductionOpT, + typename InitT = + cub::detail::non_void_value_t< + OutputIteratorT, + cub::detail::value_t>, + typename AccumT = + detail::accumulator_t< + ReductionOpT, + InitT, + cub::detail::value_t>, + typename SelectedPolicy = DeviceReducePolicy> +struct DispatchSegmentedReduce : SelectedPolicy { - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< 
[in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out; ///< [out] Pointer to the output aggregate - OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets; ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets; ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - int ptx_version; ///< [in] PTX version - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedReduce( - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - OutputT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_in(d_in), - d_out(d_out), - num_segments(num_segments), - d_begin_offsets(d_begin_offsets), - d_end_offsets(d_end_offsets), - reduction_op(reduction_op), - init(init), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel - { + //--------------------------------------------------------------------------- + // Problem state + //--------------------------------------------------------------------------- + + /// Device-accessible allocation of temporary storage. When `nullptr`, the + /// required allocation size is written to `temp_storage_bytes` and no work + /// is done. 
+ void *d_temp_storage; + + /// Reference to size in bytes of `d_temp_storage` allocation + size_t &temp_storage_bytes; + + /// Pointer to the input sequence of data items + InputIteratorT d_in; + + /// Pointer to the output aggregate + OutputIteratorT d_out; + + /// The number of segments that comprise the sorting data + OffsetT num_segments; + + /// Random-access input iterator to the sequence of beginning offsets of + /// length `num_segments`, such that `d_begin_offsets[i]` is the first + /// element of the *i*th data segment in `d_keys_*` and + /// `d_values_*` + BeginOffsetIteratorT d_begin_offsets; + + /// Random-access input iterator to the sequence of ending offsets of length + /// `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + /// the *i*th data segment in `d_keys_*` and `d_values_*`. + /// If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + /// considered empty. + EndOffsetIteratorT d_end_offsets; + + /// Binary reduction functor + ReductionOpT reduction_op; + + /// The initial value of the reduction + InitT init; + + /// CUDA stream to launch kernels within. Default is stream0. + cudaStream_t stream; + + /// Whether or not to synchronize the stream after every kernel launch to + /// check for errors. Also causes launch configurations to be printed to the + /// console. Default is `false`. + bool debug_synchronous; + + int ptx_version; + + //--------------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------------- + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedReduce(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_in(d_in) + , d_out(d_out) + , num_segments(num_segments) + , d_begin_offsets(d_begin_offsets) + , d_end_offsets(d_end_offsets) + , reduction_op(reduction_op) + , init(init) + , stream(stream) + , debug_synchronous(debug_synchronous) + , ptx_version(ptx_version) + {} + + //--------------------------------------------------------------------------- + // Chained policy invocation + //--------------------------------------------------------------------------- + + /** + * @brief Invocation + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam DeviceSegmentedReduceKernelT + * Function type of cub::DeviceSegmentedReduceKernel + * + * @param[in] segmented_reduce_kernel + * Kernel function pointer to parameterization of + * cub::DeviceSegmentedReduceKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel) + { #ifndef CUB_RUNTIME_ENABLED - (void)segmented_reduce_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); + (void)segmented_reduce_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); #else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - return cudaSuccess; - } - - // Init kernel configuration - KernelConfig 
segmented_reduce_config; - if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; - - // Log device_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + cudaError error = cudaSuccess; + + do + { + // Return if the caller is simply requesting the size of the storage + // allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Init kernel configuration + KernelConfig segmented_reduce_config; + if (CubDebug( + error = segmented_reduce_config + .Init( + segmented_reduce_kernel))) + { + break; + } + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), " + "%d items per thread, %d SM occupancy\n", num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, - (long long) stream, + (long long)stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); - - // Invoke DeviceReduceKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - num_segments, - ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream - ).doit(segmented_reduce_kernel, - d_in, - d_out, - d_begin_offsets, - d_end_offsets, - num_segments, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Invoke DeviceReduceKernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + 0, + stream) + .doit(segmented_reduce_kernel, + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_segments, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; } - while (0); + } + } while (0); - return error; + return error; #endif // CUB_RUNTIME_ENABLED - - } - - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() + } + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedReduceKernel); + } + + //--------------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------------- + + /** + * @brief Internal dispatch routine for computing a device-wide reduction + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_segments + * The number of segments that comprise the sorting data + * + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and + * `d_values_*` + * + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * considered empty. + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. Also causes launch configurations to + * be printed to the console. Default is `false`. + */ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous) + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + if (num_segments <= 0) { - typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - return InvokePasses( - DeviceSegmentedReduceKernel); + return cudaSuccess; } + cudaError error = cudaSuccess; - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init, ///< [in] The initial value of the reduction - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + do { - typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; - - if (num_segments <= 0) - return cudaSuccess; - - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchSegmentedReduce dispatch( - d_temp_storage, temp_storage_bytes, - d_in, d_out, - num_segments, d_begin_offsets, d_end_offsets, - reduction_op, init, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - } - while (0); - - return error; - } + // Get PTX version + int ptx_version = 0; + if (CubDebug(error = PtxVersion(ptx_version))) + { + break; + } + + // Create dispatch functor + DispatchSegmentedReduce dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + reduction_op, + init, + stream, + debug_synchronous, + ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + { + break; + } + } while (0); + + return error; + } }; - CUB_NAMESPACE_END - diff --git a/cub/thread/thread_operators.cuh b/cub/thread/thread_operators.cuh index c220ecacfe..b032763023 100644 --- a/cub/thread/thread_operators.cuh +++ b/cub/thread/thread_operators.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -13,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Simple binary operator functor types */ @@ -37,308 +37,324 @@ #pragma once -#include "../config.cuh" -#include "../util_type.cuh" +#include +#include + +#include +#include CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilModule + * @addtogroup UtilModule * @{ */ -/** - * \brief Default equality functor - */ +/// @brief Default equality functor struct Equality { - /// Boolean equality operator, returns (a == b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const - { - return a == b; - } + /// Boolean equality operator, returns `t == u` + template + __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) const + { + return cuda::std::forward(t) == cuda::std::forward(u); + } }; - -/** - * \brief Default inequality functor - */ +/// @brief Default inequality functor struct Inequality { - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const - { - return a != b; - } + /// Boolean inequality operator, returns `t != u` + template + __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) const + { + return cuda::std::forward(t) != cuda::std::forward(u); + } }; - -/** - * \brief Inequality functor (wraps equality functor) - */ +/// @brief Inequality functor (wraps equality functor) template struct InequalityWrapper { - /// Wrapped equality operator - EqualityOp op; + /// Wrapped equality operator + EqualityOp op; - /// Constructor - __host__ __device__ __forceinline__ - InequalityWrapper(EqualityOp op) : op(op) {} + /// Constructor + __host__ __device__ __forceinline__ InequalityWrapper(EqualityOp op) + : op(op) + {} - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) - { - return !op(a, b); - } + /// Boolean inequality operator, returns `t != u` + template + __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) + { + return !op(std::forward(t), std::forward(u)); + } }; - -/** - * \brief Default sum functor - */ +/// @brief Default sum functor struct Sum { - /// Binary sum operator, returns a + b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return a + b; - } + /// Binary sum operator, returns `t + u` + template + __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const + -> decltype(cuda::std::forward(t) + cuda::std::forward(u)) + { + return cuda::std::forward(t) + cuda::std::forward(u); + } }; -/** - * \brief Default difference functor - */ +/// @brief Default difference functor struct Difference { - /// Binary difference operator, returns a - b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + /// Binary difference operator, returns `t - u` + template + __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const + -> decltype(cuda::std::forward(t) - cuda::std::forward(u)) { - return a - b; + return cuda::std::forward(t) - cuda::std::forward(u); } }; -/** - * \brief Default division functor - */ +/// @brief Default division 
functor struct Division { - /// Binary difference operator, returns a - b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + /// Binary division operator, returns `t / u` + template + __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const + -> decltype(cuda::std::forward(t) / cuda::std::forward(u)) { - return a / b; + return cuda::std::forward(t) / cuda::std::forward(u); } }; - -/** - * \brief Default max functor - */ +/// @brief Default max functor struct Max { - /// Boolean max operator, returns (a > b) ? a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return CUB_MAX(a, b); - } + /// Boolean max operator, returns `(t > u) ? t : u` + template + __host__ __device__ __forceinline__ + typename cuda::std::common_type::type + operator()(T &&t, U &&u) const + { + return CUB_MAX(t, u); + } }; - -/** - * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) - */ +/// @brief Arg max functor (keeps the value and offset of the first occurrence +/// of the larger item) struct ArgMax { - /// Boolean max operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair &a, - const KeyValuePair &b) const - { -// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) -// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + /// Boolean max operator, preferring the item having the smaller offset in + /// case of ties + template + __host__ __device__ __forceinline__ KeyValuePair + operator()(const KeyValuePair &a, + const KeyValuePair &b) const + { + // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) + // return ((b.value > a.value) || + // ((a.value == b.value) && (b.key < a.key))) + // ? b : a; - if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) - return b; - return a; + if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) + { + return b; } -}; + return a; + } +}; -/** - * \brief Default min functor - */ +/// @brief Default min functor struct Min { - /// Boolean min operator, returns (a < b) ? a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return CUB_MIN(a, b); - } + /// Boolean min operator, returns `(t < u) ? t : u` + template + __host__ __device__ __forceinline__ + typename cuda::std::common_type::type + operator()(T &&t, U &&u) const + { + return CUB_MIN(t, u); + } }; - -/** - * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) - */ +/// @brief Arg min functor (keeps the value and offset of the first occurrence +/// of the smallest item) struct ArgMin { - /// Boolean min operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair &a, - const KeyValuePair &b) const - { -// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) -// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? 
b : a; + /// Boolean min operator, preferring the item having the smaller offset in + /// case of ties + template + __host__ __device__ __forceinline__ KeyValuePair + operator()(const KeyValuePair &a, + const KeyValuePair &b) const + { + // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) + // return ((b.value < a.value) || + // ((a.value == b.value) && (b.key < a.key))) + // ? b : a; - if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) - return b; - return a; + if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) + { + return b; } -}; + return a; + } +}; -/** - * \brief Default cast functor - */ +/// @brief Default cast functor template struct CastOp { - /// Cast operator, returns (B) a - template - __host__ __device__ __forceinline__ B operator()(const A &a) const - { - return (B) a; - } + /// Cast operator, returns `(B) a` + template + __host__ __device__ __forceinline__ B operator()(A &&a) const + { + return (B)a; + } }; - -/** - * \brief Binary operator wrapper for switching non-commutative scan arguments - */ +/// @brief Binary operator wrapper for switching non-commutative scan arguments template class SwizzleScanOp { private: - - /// Wrapped scan operator - ScanOp scan_op; + /// Wrapped scan operator + ScanOp scan_op; public: + /// Constructor + __host__ __device__ __forceinline__ SwizzleScanOp(ScanOp scan_op) + : scan_op(scan_op) + {} - /// Constructor - __host__ __device__ __forceinline__ - SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} - - /// Switch the scan arguments - template - __host__ __device__ __forceinline__ - T operator()(const T &a, const T &b) - { - T _a(a); - T _b(b); + /// Switch the scan arguments + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) + { + T _a(a); + T _b(b); - return scan_op(_b, _a); - } + return scan_op(_b, _a); + } }; - /** - * \brief Reduce-by-segment functor. + * @brief Reduce-by-segment functor. * - * Given two cub::KeyValuePair inputs \p a and \p b and a - * binary associative combining operator \p f(const T &x, const T &y), - * an instance of this functor returns a cub::KeyValuePair whose \p key - * field is a.key + b.key, and whose \p value field - * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. + * Given two cub::KeyValuePair inputs `a` and `b` and a binary associative + * combining operator `f(const T &x, const T &y)`, an instance of this functor + * returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose + * `value` field is either `b.value` if `b.key` is non-zero, or + * `f(a.value, b.value)` otherwise. * - * ReduceBySegmentOp is an associative, non-commutative binary combining operator - * for input sequences of cub::KeyValuePair pairings. Such - * sequences are typically used to represent a segmented set of values to be reduced + * ReduceBySegmentOp is an associative, non-commutative binary combining + * operator for input sequences of cub::KeyValuePair pairings. Such sequences + * are typically used to represent a segmented set of values to be reduced * and a corresponding set of {0,1}-valued integer "head flags" demarcating the * first value of each segment. 
* + * @tparam ReductionOpT Binary reduction operator to apply to values */ -template ///< Binary reduction operator to apply to values +template struct ReduceBySegmentOp { - /// Wrapped reduction operator - ReductionOpT op; + /// Wrapped reduction operator + ReductionOpT op; - /// Constructor - __host__ __device__ __forceinline__ ReduceBySegmentOp() {} + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp() {} - /// Constructor - __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) + : op(op) + {} - /// Scan operator - template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) - __host__ __device__ __forceinline__ KeyValuePairT operator()( - const KeyValuePairT &first, ///< First partial reduction - const KeyValuePairT &second) ///< Second partial reduction - { - KeyValuePairT retval; - retval.key = first.key + second.key; + /** + * @brief Scan operator + * + * @tparam KeyValuePairT + * KeyValuePair pairing of T (value) and OffsetT (head flag) + * + * @param[in] first + * First partial reduction + * + * @param[in] second + * Second partial reduction + */ + template + __host__ __device__ __forceinline__ KeyValuePairT + operator()(const KeyValuePairT &first, const KeyValuePairT &second) + { + KeyValuePairT retval; + retval.key = first.key + second.key; #ifdef _NVHPC_CUDA // WAR bug on nvc++ - if (second.key) - { - retval.value = second.value; - } - else - { - // If second.value isn't copied into a temporary here, nvc++ will - // crash while compiling the TestScanByKeyWithLargeTypes test in - // thrust/testing/scan_by_key.cu: - auto v2 = second.value; - retval.value = op(first.value, v2); - } + if (second.key) + { + retval.value = second.value; + } + else + { + // If second.value isn't copied into a temporary here, nvc++ will + // crash while compiling the TestScanByKeyWithLargeTypes test in + // thrust/testing/scan_by_key.cu: + auto v2 = second.value; + retval.value = op(first.value, v2); + } #else // not nvc++: - retval.value = (second.key) ? - second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate - op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate + // if (second.key) { + // The second partial reduction spans a segment reset, so it's value + // aggregate becomes the running aggregate + // else { + // The second partial reduction does not span a reset, so accumulate both + // into the running aggregate + // } + retval.value = (second.key) ? 
second.value : op(first.value, second.value); #endif - return retval; - } + return retval; + } }; - -template ///< Binary reduction operator to apply to values +/** + * @tparam ReductionOpT Binary reduction operator to apply to values + */ +template struct ReduceByKeyOp { - /// Wrapped reduction operator - ReductionOpT op; - - /// Constructor - __host__ __device__ __forceinline__ ReduceByKeyOp() {} + /// Wrapped reduction operator + ReductionOpT op; - /// Constructor - __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp() {} - /// Scan operator - template - __host__ __device__ __forceinline__ KeyValuePairT operator()( - const KeyValuePairT &first, ///< First partial reduction - const KeyValuePairT &second) ///< Second partial reduction - { - KeyValuePairT retval = second; + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) + : op(op) + {} - if (first.key == second.key) - retval.value = op(first.value, retval.value); + /** + * @brief Scan operator + * + * @param[in] first First partial reduction + * @param[in] second Second partial reduction + */ + template + __host__ __device__ __forceinline__ KeyValuePairT + operator()(const KeyValuePairT &first, const KeyValuePairT &second) + { + KeyValuePairT retval = second; - return retval; + if (first.key == second.key) + { + retval.value = op(first.value, retval.value); } -}; + return retval; + } +}; template struct BinaryFlip diff --git a/cub/thread/thread_reduce.cuh b/cub/thread/thread_reduce.cuh index 98fb2faab3..9e4fb79d45 100644 --- a/cub/thread/thread_reduce.cuh +++ b/cub/thread/thread_reduce.cuh @@ -34,6 +34,7 @@ #pragma once #include "../thread/thread_operators.cuh" +#include "../detail/type_traits.cuh" #include "../config.cuh" CUB_NAMESPACE_BEGIN @@ -47,14 +48,16 @@ namespace internal { template < int LENGTH, typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( + typename ReductionOp, + typename PrefixT, + typename AccumT = detail::accumulator_t> +__device__ __forceinline__ AccumT ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix, ///< [in] Prefix to seed reduction with + PrefixT prefix, ///< [in] Prefix to seed reduction with Int2Type /*length*/) { - T retval = prefix; + AccumT retval = prefix; #pragma unroll for (int i = 0; i < LENGTH; ++i) @@ -74,11 +77,13 @@ __device__ __forceinline__ T ThreadReduce( template < int LENGTH, typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( + typename ReductionOp, + typename PrefixT, + typename AccumT = detail::accumulator_t> +__device__ __forceinline__ AccumT ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with + PrefixT prefix) ///< [in] Prefix to seed reduction with { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } @@ -114,11 +119,13 @@ __device__ __forceinline__ T ThreadReduce( template < int LENGTH, typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( + typename ReductionOp, + typename PrefixT, + typename AccumT = detail::accumulator_t> +__device__ __forceinline__ AccumT ThreadReduce( T (&input)[LENGTH], ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with + PrefixT prefix) ///< [in] Prefix to seed reduction with { 
return ThreadReduce(input, reduction_op, prefix, Int2Type()); } diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index 0599add52c..a8158b35a8 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -78,8 +78,9 @@ enum Backend struct CustomMax { /// Boolean max operator, returns (a > b) ? a : b - template - __host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b) + template + __host__ __device__ auto operator()(T&& a, C&& b) + -> cub::detail::accumulator_t { return CUB_MAX(a, b); } @@ -636,7 +637,9 @@ void Initialize( template struct Solution { - typedef _OutputT OutputT; + using OutputT = _OutputT; + using InitT = OutputT; + using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, @@ -644,7 +647,7 @@ struct Solution { for (int i = 0; i < num_segments; ++i) { - OutputT aggregate = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent + AccumT aggregate = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, OutputT(h_in[j])); h_reference[i] = aggregate; @@ -656,7 +659,9 @@ struct Solution template struct Solution { - typedef _OutputT OutputT; + using OutputT = _OutputT; + using InitT = OutputT; + using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, @@ -664,7 +669,7 @@ struct Solution { for (int i = 0; i < num_segments; ++i) { - OutputT aggregate = Traits::Max(); // replace with std::numeric_limits::max() when C++ support is more prevalent + AccumT aggregate = Traits::Max(); // replace with std::numeric_limits::max() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, OutputT(h_in[j])); h_reference[i] = aggregate; @@ -677,7 +682,9 @@ struct Solution template struct Solution { - typedef _OutputT OutputT; + using OutputT = _OutputT; + using InitT = OutputT; + using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, @@ -685,10 +692,10 @@ struct Solution { for (int i = 0; i < num_segments; ++i) { - OutputT aggregate; + AccumT aggregate; InitValue(INTEGER_SEED, aggregate, 0); for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) - aggregate = reduction_op(aggregate, OutputT(h_in[j])); + aggregate = reduction_op(aggregate, h_in[j]); h_reference[i] = aggregate; } } @@ -795,6 +802,7 @@ void Test( // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose); + printf("\t%s", compare ? 
"FAIL" : "PASS"); // Flush any stdout/stderr @@ -1182,13 +1190,173 @@ void TestType( OffsetT max_items, OffsetT max_segments) { - typedef typename DeviceReducePolicy::MaxPolicy MaxPolicyT; + typedef typename DeviceReducePolicy::MaxPolicy MaxPolicyT; TestBySize dispatch(max_items, max_segments); MaxPolicyT::Invoke(g_ptx_version, dispatch); } +class CustomInputT +{ + char m_val{}; + +public: + __host__ __device__ explicit CustomInputT(char val) + : m_val(val) + {} + + __host__ __device__ int get() const { return static_cast(m_val); } +}; + +class CustomAccumulatorT +{ + int m_val{0}; + int m_magic_value{42}; + + __host__ __device__ CustomAccumulatorT(int val) + : m_val(val) + {} + +public: + __host__ __device__ CustomAccumulatorT() + {} + + __host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in) + : m_val(in.is_valid() * in.get()) + , m_magic_value(in.is_valid() * 42) + {} + + __host__ __device__ void operator=(const CustomInputT &in) + { + if (this->is_valid()) + { + m_val = in.get(); + } + } + + __host__ __device__ void operator=(const CustomAccumulatorT &in) + { + if (this->is_valid() && in.is_valid()) + { + m_val = in.get(); + } + } + + __host__ __device__ CustomAccumulatorT + operator+(const CustomInputT &in) const + { + const int multiplier = this->is_valid(); + return {(m_val + in.get()) * multiplier}; + } + + __host__ __device__ CustomAccumulatorT + operator+(const CustomAccumulatorT &in) const + { + const int multiplier = this->is_valid() && in.is_valid(); + return {(m_val + in.get()) * multiplier}; + } + + __host__ __device__ int get() const { return m_val; } + + __host__ __device__ bool is_valid() const { return m_magic_value == 42; } +}; + +class CustomOutputT +{ + bool *m_d_flag{}; + int m_expected{}; + +public: + __host__ __device__ CustomOutputT(bool *d_flag, int expected) + : m_d_flag(d_flag) + , m_expected(expected) + {} + + __host__ __device__ void operator=(const CustomAccumulatorT &accum) const + { + *m_d_flag = accum.is_valid() && (accum.get() == m_expected); + } +}; + +__global__ void InitializeTestAccumulatorTypes(int num_items, + int expected, + bool *d_flag, + CustomInputT *d_in, + CustomOutputT *d_out) +{ + const int idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + + if (idx < num_items) + { + d_in[idx] = CustomInputT(1); + } + + if (idx == 0) + { + *d_out = CustomOutputT{d_flag, expected}; + } +} + +void TestAccumulatorTypes() +{ + const int num_items = 2 * 1024 * 1024; + const int expected = num_items; + const int block_size = 256; + const int grid_size = (num_items + block_size - 1) / block_size; + + CustomInputT *d_in{}; + CustomOutputT *d_out{}; + CustomAccumulatorT init{}; + bool *d_flag{}; + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_out, sizeof(CustomOutputT))); + CubDebugExit(g_allocator.DeviceAllocate((void **)&d_flag, sizeof(bool))); + CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, + sizeof(CustomInputT) * num_items)); + + InitializeTestAccumulatorTypes<<>>(num_items, + expected, + d_flag, + d_in, + d_out); + + std::uint8_t *d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum{}, + init, + 0, + true)); + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); + CubDebugExit(cudaMemset(d_temp_storage, 1, temp_storage_bytes)); + + CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum{}, + 
init, + 0, + true)); + + bool ok{}; + CubDebugExit(cudaMemcpy(&ok, d_flag, sizeof(bool), cudaMemcpyDeviceToHost)); + + AssertTrue(ok); + + CubDebugExit(g_allocator.DeviceFree(d_out)); + CubDebugExit(g_allocator.DeviceFree(d_in)); +} //--------------------------------------------------------------------- // Main @@ -1256,6 +1424,8 @@ int main(int argc, char** argv) #else // TEST_TYPES == 3 TestType(max_items, max_segments); TestType(max_items, max_segments); + + TestAccumulatorTypes(); #endif printf("\n"); return 0; diff --git a/test/test_thread_operators.cu b/test/test_thread_operators.cu new file mode 100644 index 0000000000..44cc3a81d6 --- /dev/null +++ b/test/test_thread_operators.cu @@ -0,0 +1,259 @@ +/******************************************************************************* + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+#include "test_util.h"
+
+#include <cub/thread/thread_operators.cuh>
+
+template <class T>
+T Make(int val)
+{
+  return T{val};
+}
+
+template <bool>
+class BaseT
+{
+protected:
+  int m_val{};
+
+public:
+  BaseT(int val)
+      : m_val{val}
+  {}
+};
+
+template <>
+class BaseT<true>
+{
+protected:
+  int m_val{};
+
+public:
+  BaseT(int val)
+      : m_val{val}
+  {}
+
+  __host__ __device__ operator int() const { return m_val; }
+};
+
+#define CUSTOM_TYPE_FACTORY(NAME, RT, OP, CONVERTABLE)                        \
+  class Custom##NAME##T : public BaseT<CONVERTABLE>                           \
+  {                                                                           \
+    explicit Custom##NAME##T(int val)                                         \
+        : BaseT<CONVERTABLE>(val)                                             \
+    {}                                                                        \
+                                                                              \
+    friend Custom##NAME##T Make<Custom##NAME##T>(int);                        \
+                                                                              \
+  public:                                                                     \
+    __host__ __device__ RT operator OP(int val) const                         \
+    {                                                                         \
+      return m_val OP val;                                                    \
+    }                                                                         \
+  }
+
+//                  NAME  RT    OP  CONVERTABLE
+CUSTOM_TYPE_FACTORY(Eq,   bool, ==, false);
+CUSTOM_TYPE_FACTORY(Ineq, bool, !=, false);
+CUSTOM_TYPE_FACTORY(Sum,  int,  +,  false);
+CUSTOM_TYPE_FACTORY(Diff, int,  -,  false);
+CUSTOM_TYPE_FACTORY(Div,  int,  /,  false);
+CUSTOM_TYPE_FACTORY(Gt,   bool, >,  true);
+CUSTOM_TYPE_FACTORY(Lt,   bool, <,  true);
+
+void TestEquality()
+{
+  cub::Equality op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), true);
+  AssertEquals(op(const_magic_val, magic_val), true);
+  AssertEquals(op(const_magic_val, magic_val + 1), false);
+
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val), true);
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val + 1), false);
+}
+
+void TestInequality()
+{
+  cub::Inequality op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val + 1), true);
+
+  AssertEquals(op(Make<CustomIneqT>(magic_val), magic_val), false);
+  AssertEquals(op(Make<CustomIneqT>(magic_val), magic_val + 1), true);
+}
+
+void TestInequalityWrapper()
+{
+  cub::Equality wrapped_op{};
+  cub::InequalityWrapper<cub::Equality> op{wrapped_op};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val + 1), true);
+
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val), false);
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val + 1), true);
+}
+
+#define CUSTOM_SYNC_T(NAME, RT, OP)                                           \
+  struct Custom ## NAME ## Sink                                               \
+  {                                                                           \
+    template <class T>                                                        \
+    __host__ __device__ RT operator OP (T &&) const                           \
+    {                                                                         \
+      return RT{};                                                            \
+    }                                                                         \
+  }
+
+CUSTOM_SYNC_T(SumInt, int, +);
+CUSTOM_SYNC_T(SumCustomInt, CustomSumIntSink, +);
+
+CUSTOM_SYNC_T(DiffInt, int, -);
+CUSTOM_SYNC_T(DiffCustomInt, CustomDiffIntSink, -);
+
+CUSTOM_SYNC_T(DivInt, int, /);
+CUSTOM_SYNC_T(DivCustomInt, CustomDivIntSink, /);
+
+template <class T, class ExpectedT>
+void StaticSame()
+{
+  static_assert(std::is_same<T, ExpectedT>::value, "shall match");
+}
+
+void TestSum()
+{
+  cub::Sum op{};
+
+  const int const_magic_val = 40;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+  AssertEquals(op(Make<CustomSumT>(magic_val), 2), 42);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(Make<CustomSumT>(magic_val), 2)), int>();
+  StaticSame<decltype(op(CustomSumIntSink{}, CustomSumCustomIntSink{})), int>();
+}
+
+void TestDifference()
+{
+  cub::Difference op{};
+
+  const int const_magic_val = 44;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+
+  AssertEquals(op(Make<CustomDiffT>(magic_val), 2), 42);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(Make<CustomDiffT>(magic_val), 2)), int>();
+  StaticSame<decltype(op(CustomDiffIntSink{}, CustomDiffCustomIntSink{})), int>();
+}
+
+void TestDivision()
+{
+  cub::Division op{};
+
+  const int const_magic_val = 44;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 22);
+  AssertEquals(op(magic_val, 2), 22);
+
+  AssertEquals(op(Make<CustomDivT>(magic_val), 2), 22);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(Make<CustomDivT>(magic_val), 2)), int>();
+  StaticSame<decltype(op(CustomDivIntSink{}, CustomDivCustomIntSink{})), int>();
+}
+
+void TestMax()
+{
+  cub::Max op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+
+  AssertEquals(op(2, Make<CustomGtT>(magic_val)), 42);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(2, Make<CustomGtT>(magic_val))), int>();
+}
+
+void TestMin()
+{
+  cub::Min op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 2);
+  AssertEquals(op(magic_val, 2), 2);
+
+  AssertEquals(op(2, Make<CustomLtT>(magic_val)), 2);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(2, Make<CustomLtT>(magic_val))), int>();
+}
+
+int main()
+{
+  TestEquality();
+  TestInequality();
+  TestInequalityWrapper();
+  TestSum();
+  TestDifference();
+  TestDivision();
+  TestMax();
+  TestMin();
+
+  return 0;
+}
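
Reviewer aside, not part of the patch: with the accumulator-type changes above, the type used for intermediate accumulation in cub::DeviceReduce::Reduce follows decltype(reduction_op(init, *d_in)) rather than the input value type, which is what TestAccumulatorTypes exercises with CustomAccumulatorT. The sketch below is a minimal, hypothetical usage example of that behaviour against the public DeviceReduce::Reduce interface only (float inputs, double init, therefore a double accumulator); names such as h_in/d_temp_storage are illustrative and error handling is intentionally minimal.

// Usage sketch: float inputs + double init => double accumulator (P2322R6).
#include <cub/device/device_reduce.cuh>

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  const int num_items = 1 << 20;
  std::vector<float> h_in(num_items, 1.0f);

  float *d_in{};
  double *d_out{};
  cudaMalloc(&d_in, sizeof(float) * num_items);
  cudaMalloc(&d_out, sizeof(double));
  cudaMemcpy(d_in, h_in.data(), sizeof(float) * num_items,
             cudaMemcpyHostToDevice);

  // The init value is a double, so cub::Sum{}(double, float) yields double and
  // the partial sums are carried in double precision, not float.
  const double init = 0.0;

  // Standard two-phase CUB call: temporary-storage size query, then reduction.
  void *d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                            d_in, d_out, num_items, cub::Sum{}, init);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                            d_in, d_out, num_items, cub::Sum{}, init);

  double h_out{};
  cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost);
  std::printf("sum = %f\n", h_out); // expected: 1048576.000000

  cudaFree(d_temp_storage);
  cudaFree(d_out);
  cudaFree(d_in);
  return 0;
}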