From a064b129673e1f68c511540934dd9d70c6b16683 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sat, 4 Jun 2022 12:41:23 +0400 Subject: [PATCH] P2322R6 accumulator types for reduce --- cub/agent/agent_reduce.cuh | 661 +++--- .../block_reduce_warp_reductions.cuh | 2 +- cub/detail/type_traits.cuh | 14 +- cub/device/device_reduce.cuh | 32 +- cub/device/dispatch/dispatch_reduce.cuh | 1838 ++++++++++------- cub/thread/thread_operators.cuh | 434 ++-- cub/thread/thread_reduce.cuh | 27 +- test/test_device_reduce.cu | 190 +- test/test_thread_operators.cu | 259 +++ 9 files changed, 2221 insertions(+), 1236 deletions(-) create mode 100644 test/test_thread_operators.cu diff --git a/cub/agent/agent_reduce.cuh b/cub/agent/agent_reduce.cuh index 73a2ba492a..aa18efa07e 100644 --- a/cub/agent/agent_reduce.cuh +++ b/cub/agent/agent_reduce.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -13,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -27,357 +27,418 @@ ******************************************************************************/ /** - * \file - * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * @file cub::AgentReduce implements a stateful abstraction of CUDA thread + * blocks for participating in device-wide reduction. 
*/ #pragma once #include -#include "../block/block_load.cuh" -#include "../block/block_reduce.cuh" -#include "../grid/grid_mapping.cuh" -#include "../grid/grid_even_share.cuh" -#include "../config.cuh" -#include "../util_type.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" - +#include +#include +#include +#include +#include +#include +#include +#include CUB_NAMESPACE_BEGIN - /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentReduce + * @tparam NOMINAL_BLOCK_THREADS_4B Threads per thread block + * @tparam NOMINAL_ITEMS_PER_THREAD_4B Items per thread (per tile of input) + * @tparam ComputeT Dominant compute type + * @tparam _VECTOR_LOAD_LENGTH Number of items per vectorized load + * @tparam _BLOCK_ALGORITHM Cooperative block-wide reduction algorithm to use + * @tparam _LOAD_MODIFIER Cache load modifier for reading input elements */ -template < - int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block - int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input) - typename ComputeT, ///< Dominant compute type - int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - typename ScalingType = MemBoundScaling > -struct AgentReducePolicy : - ScalingType +template > +struct AgentReducePolicy : ScalingType { - enum - { - VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - }; - - static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements -}; + /// Number of items per vectorized load + static constexpr int VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH; + /// Cooperative block-wide reduction algorithm to use + static constexpr BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; + /// Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** - * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * @brief AgentReduce implements a stateful abstraction of CUDA thread blocks + * for participating in device-wide reduction . + * + * Each thread reduces only the values it loads. If `FIRST_TILE`, this partial + * reduction is stored into `thread_aggregate`. Otherwise it is accumulated + * into `thread_aggregate`. + * + * @tparam AgentReducePolicy + * Parameterized AgentReducePolicy tuning policy type + * + * @tparam InputIteratorT + * Random-access iterator type for input + * + * @tparam OutputIteratorT + * Random-access iterator type for output + * + * @tparam OffsetT + * Signed integer type for global offsets * - * Each thread reduces only the values it loads. If \p FIRST_TILE, this - * partial reduction is stored into \p thread_aggregate. Otherwise it is - * accumulated into \p thread_aggregate. 
+ * @tparam ReductionOp + * Binary reduction operator type having member + * `auto operator()(T &&a, U &&b)` + * + * @tparam AccumT + * The type of intermediate accumulator (according to P2322R6) */ -template < - typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type - typename InputIteratorT, ///< Random-access iterator type for input - typename OutputIteratorT, ///< Random-access iterator type for output - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) +template struct AgentReduce { - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The input value type - using InputT = cub::detail::value_t; - - /// The output value type - using OutputT = cub::detail::non_void_value_t; - - /// Vector type of InputT for data movement - using VectorT = - typename CubVector::Type; - - /// Input iterator wrapper type (for applying cache modifier) - // Wrap the native input pointer with CacheModifiedInputIterator - // or directly use the supplied input iterator type - using WrappedInputIteratorT = cub::detail::conditional_t< - std::is_pointer::value, - CacheModifiedInputIterator, - InputIteratorT>; - - /// Constants + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + using InputT = cub::detail::value_t; + + /// Vector type of InputT for data movement + using VectorT = + typename CubVector::Type; + + /// Input iterator wrapper type (for applying cache modifier) + // Wrap the native input pointer with CacheModifiedInputIterator + // or directly use the supplied input iterator type + using WrappedInputIteratorT = cub::detail::conditional_t< + std::is_pointer::value, + CacheModifiedInputIterator, + InputIteratorT>; + + /// Constants + static constexpr int BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS; + static constexpr int ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD; + static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; + static constexpr int VECTOR_LOAD_LENGTH = + CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH); + + // Can vectorize according to the policy if the input iterator is a native + // pointer to a primitive type + static constexpr bool ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && + (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && + (std::is_pointer::value) && + Traits::PRIMITIVE; + + static constexpr CacheLoadModifier LOAD_MODIFIER = + AgentReducePolicy::LOAD_MODIFIER; + + static constexpr BlockReduceAlgorithm BLOCK_ALGORITHM = + AgentReducePolicy::BLOCK_ALGORITHM; + + /// Parameterized BlockReduce primitive + using BlockReduceT = + BlockReduce; + + /// Shared memory type required by this thread block + struct _TempStorage + { + typename BlockReduceT::TempStorage reduce; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> + {}; + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + InputIteratorT d_in; ///< Input data to reduce + WrappedInputIteratorT d_wrapped_in; ///< Wrapped input 
data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + + //--------------------------------------------------------------------- + // Utility + //--------------------------------------------------------------------- + + // Whether or not the input is aligned with the vector type (specialized for + // types we can vectorize) + template + static __device__ __forceinline__ bool + IsAligned(Iterator d_in, Int2Type /*can_vectorize*/) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for + // types we cannot vectorize) + template + static __device__ __forceinline__ bool + IsAligned(Iterator /*d_in*/, Int2Type /*can_vectorize*/) + { + return false; + } + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + /** + * @brief Constructor + * @param temp_storage Reference to temp_storage + * @param d_in Input data to reduce + * @param reduction_op Binary reduction operator + */ + __device__ __forceinline__ AgentReduce(TempStorage &temp_storage, + InputIteratorT d_in, + ReductionOp reduction_op) + : temp_storage(temp_storage.Alias()) + , d_in(d_in) + , d_wrapped_in(d_in) + , reduction_op(reduction_op) + {} + + //--------------------------------------------------------------------- + // Tile consumption + //--------------------------------------------------------------------- + + /** + * @brief Consume a full tile of input (non-vectorized) + * @param block_offset The offset the tile to consume + * @param valid_items The number of valid items in the tile + * @param is_full_tile Whether or not this is a full tile + * @param can_vectorize Whether or not we can vectorize loads + */ + template + __device__ __forceinline__ void ConsumeTile(AccumT &thread_aggregate, + OffsetT block_offset, + int /*valid_items*/, + Int2Type /*is_full_tile*/, + Int2Type /*can_vectorize*/) + { + AccumT items[ITEMS_PER_THREAD]; + + // Load items in striped fashion + LoadDirectStriped(threadIdx.x, + d_wrapped_in + block_offset, + items); + + // Reduce items within each thread stripe + thread_aggregate = + (IS_FIRST_TILE) + ? 
internal::ThreadReduce(items, reduction_op) + : internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + /** + * Consume a full tile of input (vectorized) + * @param block_offset The offset the tile to consume + * @param valid_items The number of valid items in the tile + * @param is_full_tile Whether or not this is a full tile + * @param can_vectorize Whether or not we can vectorize loads + */ + template + __device__ __forceinline__ void ConsumeTile(AccumT &thread_aggregate, + OffsetT block_offset, + int /*valid_items*/, + Int2Type /*is_full_tile*/, + Int2Type /*can_vectorize*/) + { + // Alias items as an array of VectorT and load it in striped fashion enum { - BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, - VECTOR_LOAD_LENGTH = CUB_MIN(int(ITEMS_PER_THREAD), int(AgentReducePolicy::VECTOR_LOAD_LENGTH)), - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type - ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && - (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && - (std::is_pointer::value) && Traits::PRIMITIVE, - + WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; - static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; - static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; - - /// Parameterized BlockReduce primitive - using BlockReduceT = - BlockReduce; - - /// Shared memory type required by this thread block - struct _TempStorage + // Fabricate a vectorized input iterator + InputT *d_in_unqualified = const_cast(d_in) + block_offset + + (threadIdx.x * VECTOR_LOAD_LENGTH); + CacheModifiedInputIterator + d_vec_in(reinterpret_cast(d_in_unqualified)); + + // Load items as vector items + InputT input_items[ITEMS_PER_THREAD]; + VectorT *vec_items = reinterpret_cast(input_items); +#pragma unroll + for (int i = 0; i < WORDS; ++i) { - typename BlockReduceT::TempStorage reduce; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - InputIteratorT d_in; ///< Input data to reduce - WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce - ReductionOp reduction_op; ///< Binary reduction operator - - - //--------------------------------------------------------------------- - // Utility - //--------------------------------------------------------------------- - - - // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) - template - static __device__ __forceinline__ bool IsAligned( - Iterator d_in, - Int2Type /*can_vectorize*/) - { - return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + vec_items[i] = d_vec_in[BLOCK_THREADS * i]; } - // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) - template - static __device__ __forceinline__ bool IsAligned( - Iterator /*d_in*/, - Int2Type /*can_vectorize*/) + // Convert from input type to output type + AccumT items[ITEMS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { - return false; + items[i] = input_items[i]; } - - //--------------------------------------------------------------------- - // Constructor 
- //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentReduce( - TempStorage& temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data to reduce - ReductionOp reduction_op) ///< Binary reduction operator - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_wrapped_in(d_in), - reduction_op(reduction_op) - {} - - - //--------------------------------------------------------------------- - // Tile consumption - //--------------------------------------------------------------------- - - /** - * Consume a full tile of input (non-vectorized) - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int /*valid_items*/, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + // Reduce items within each thread stripe + thread_aggregate = + (IS_FIRST_TILE) + ? internal::ThreadReduce(items, reduction_op) + : internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + /** + * Consume a partial tile of input + * @param block_offset The offset the tile to consume + * @param valid_items The number of valid items in the tile + * @param is_full_tile Whether or not this is a full tile + * @param can_vectorize Whether or not we can vectorize loads + */ + template + __device__ __forceinline__ void + ConsumeTile(AccumT &thread_aggregate, + OffsetT block_offset, + int valid_items, + Int2Type /*is_full_tile*/, + Int2Type /*can_vectorize*/) + { + // Partial tile + int thread_offset = threadIdx.x; + + // Read first item + if ((IS_FIRST_TILE) && (thread_offset < valid_items)) { - OutputT items[ITEMS_PER_THREAD]; - - // Load items in striped fashion - LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); - - // Reduce items within each thread stripe - thread_aggregate = (IS_FIRST_TILE) ? 
- internal::ThreadReduce(items, reduction_op) : - internal::ThreadReduce(items, reduction_op, thread_aggregate); + thread_aggregate = d_wrapped_in[block_offset + thread_offset]; + thread_offset += BLOCK_THREADS; } - - /** - * Consume a full tile of input (vectorized) - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int /*valid_items*/, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + // Continue reading items (block-striped) + while (thread_offset < valid_items) { - // Alias items as an array of VectorT and load it in striped fashion - enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; - - // Fabricate a vectorized input iterator - InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); - CacheModifiedInputIterator d_vec_in( - reinterpret_cast(d_in_unqualified)); - - // Load items as vector items - InputT input_items[ITEMS_PER_THREAD]; - VectorT *vec_items = reinterpret_cast(input_items); - #pragma unroll - for (int i = 0; i < WORDS; ++i) - vec_items[i] = d_vec_in[BLOCK_THREADS * i]; - - // Convert from input type to output type - OutputT items[ITEMS_PER_THREAD]; - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - items[i] = input_items[i]; - - // Reduce items within each thread stripe - thread_aggregate = (IS_FIRST_TILE) ? - internal::ThreadReduce(items, reduction_op) : - internal::ThreadReduce(items, reduction_op, thread_aggregate); - } - + InputT item(d_wrapped_in[block_offset + thread_offset]); - /** - * Consume a partial tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int valid_items, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads - { - // Partial tile - int thread_offset = threadIdx.x; - - // Read first item - if ((IS_FIRST_TILE) && (thread_offset < valid_items)) - { - thread_aggregate = d_wrapped_in[block_offset + thread_offset]; - thread_offset += BLOCK_THREADS; - } - - // Continue reading items (block-striped) - while (thread_offset < valid_items) - { - OutputT item (d_wrapped_in[block_offset + thread_offset]); - thread_aggregate = reduction_op(thread_aggregate, item); - thread_offset += BLOCK_THREADS; - } + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; } - - - //--------------------------------------------------------------- - // Consume a contiguous segment of tiles - //--------------------------------------------------------------------- - - /** - * \brief Reduce a contiguous segment of input tiles - */ - template - __device__ __forceinline__ OutputT ConsumeRange( - GridEvenShare &even_share, ///< GridEvenShare descriptor - Int2Type can_vectorize) ///< Whether or not we can vectorize loads + } + + //--------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * @brief Reduce a contiguous segment of input tiles + * @param even_share GridEvenShare descriptor + * @param can_vectorize Whether or not we can vectorize loads + */ + template + __device__ 
__forceinline__ AccumT + ConsumeRange(GridEvenShare &even_share, + Int2Type can_vectorize) + { + AccumT thread_aggregate{}; + + if (even_share.block_offset + TILE_ITEMS > even_share.block_end) { - OutputT thread_aggregate; - - if (even_share.block_offset + TILE_ITEMS > even_share.block_end) - { - // First tile isn't full (not all threads have valid items) - int valid_items = even_share.block_end - even_share.block_offset; - ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); - return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); - } - - // At least one full block - ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); - even_share.block_offset += even_share.block_stride; - - // Consume subsequent full tiles of input - while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) - { - ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); - even_share.block_offset += even_share.block_stride; - } - - // Consume a partially-full tile - if (even_share.block_offset < even_share.block_end) - { - int valid_items = even_share.block_end - even_share.block_offset; - ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); - } - - // Compute block-wide reduction (all threads have valid items) - return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + // First tile isn't full (not all threads have valid items) + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, + even_share.block_offset, + valid_items, + Int2Type(), + can_vectorize); + return BlockReduceT(temp_storage.reduce) + .Reduce(thread_aggregate, reduction_op, valid_items); } + // At least one full block + ConsumeTile(thread_aggregate, + even_share.block_offset, + TILE_ITEMS, + Int2Type(), + can_vectorize); + even_share.block_offset += even_share.block_stride; - /** - * \brief Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ OutputT ConsumeRange( - OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + // Consume subsequent full tiles of input + while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) { - GridEvenShare even_share; - even_share.template BlockInit(block_offset, block_end); - - return (IsAligned(d_in + block_offset, Int2Type())) ? - ConsumeRange(even_share, Int2Type()) : - ConsumeRange(even_share, Int2Type()); + ConsumeTile(thread_aggregate, + even_share.block_offset, + TILE_ITEMS, + Int2Type(), + can_vectorize); + even_share.block_offset += even_share.block_stride; } - - /** - * Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ OutputT ConsumeTiles( - GridEvenShare &even_share) ///< [in] GridEvenShare descriptor + // Consume a partially-full tile + if (even_share.block_offset < even_share.block_end) { - // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block - even_share.template BlockInit(); - - return (IsAligned(d_in, Int2Type())) ? 
- ConsumeRange(even_share, Int2Type()) : - ConsumeRange(even_share, Int2Type()); - + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, + even_share.block_offset, + valid_items, + Int2Type(), + can_vectorize); } + // Compute block-wide reduction (all threads have valid items) + return BlockReduceT(temp_storage.reduce) + .Reduce(thread_aggregate, reduction_op); + } + + /** + * @brief Reduce a contiguous segment of input tiles + * @param[in] block_offset Threadblock begin offset (inclusive) + * @param[in] block_end Threadblock end offset (exclusive) + */ + __device__ __forceinline__ AccumT ConsumeRange(OffsetT block_offset, + OffsetT block_end) + { + GridEvenShare even_share; + even_share.template BlockInit(block_offset, block_end); + + return (IsAligned(d_in + block_offset, Int2Type())) + ? ConsumeRange(even_share, + Int2Type < true && ATTEMPT_VECTORIZATION > ()) + : ConsumeRange(even_share, + Int2Type < false && ATTEMPT_VECTORIZATION > ()); + } + + /** + * Reduce a contiguous segment of input tiles + * @param[in] even_share GridEvenShare descriptor + */ + __device__ __forceinline__ AccumT + ConsumeTiles(GridEvenShare &even_share) + { + // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread + // block + even_share.template BlockInit(); + + return (IsAligned(d_in, Int2Type())) + ? ConsumeRange(even_share, + Int2Type < true && ATTEMPT_VECTORIZATION > ()) + : ConsumeRange(even_share, + Int2Type < false && ATTEMPT_VECTORIZATION > ()); + } }; - CUB_NAMESPACE_END diff --git a/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/block/specializations/block_reduce_warp_reductions.cuh index c341c8ba55..4fec6cad1b 100644 --- a/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -143,7 +143,7 @@ struct BlockReduceWarpReductions // Share lane aggregates if (lane_id == 0) { - temp_storage.warp_aggregates[warp_id] = warp_aggregate; + new (temp_storage.warp_aggregates + warp_id) T(warp_aggregate); } CTA_SYNC(); diff --git a/cub/detail/type_traits.cuh b/cub/detail/type_traits.cuh index 9b28e2c959..c68e4248a7 100644 --- a/cub/detail/type_traits.cuh +++ b/cub/detail/type_traits.cuh @@ -32,10 +32,10 @@ #pragma once -#include "../util_cpp_dialect.cuh" -#include "../util_namespace.cuh" +#include +#include -#include +#include CUB_NAMESPACE_BEGIN @@ -44,11 +44,15 @@ namespace detail { template using invoke_result_t = #if CUB_CPP_DIALECT < 2017 - typename std::result_of::type; + typename cuda::std::result_of::type; #else // 2017+ - std::invoke_result_t; + cuda::std::invoke_result_t; #endif +/// The type of intermediate accumulator (according to P2322R6) +template +using accumulator_t = + typename cuda::std::decay>::type; } // namespace detail CUB_NAMESPACE_END diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 9f70a111a4..4b10556de3 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -155,7 +155,7 @@ struct DeviceReduce // Signed integer type for global offsets typedef int OffsetT; - return DispatchReduce::Dispatch( + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, @@ -239,14 +239,16 @@ struct DeviceReduce cub::detail::non_void_value_t>; - return DispatchReduce::Dispatch( + using InitT = OutputT; + + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum(), - OutputT(), // zero-initialize + InitT{}, // zero-initialize stream, 
debug_synchronous); } @@ -314,14 +316,16 @@ struct DeviceReduce // The input value type using InputT = cub::detail::value_t; - return DispatchReduce::Dispatch( + using InitT = InputT; + + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent stream, debug_synchronous); } @@ -396,6 +400,8 @@ struct DeviceReduce cub::detail::non_void_value_t>; + using InitT = OutputTupleT; + // The output value type using OutputValueT = typename OutputTupleT::Value; @@ -406,9 +412,9 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + InitT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - return DispatchReduce::Dispatch( + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, @@ -483,14 +489,16 @@ struct DeviceReduce // The input value type using InputT = cub::detail::value_t; - return DispatchReduce::Dispatch( + using InitT = InputT; + + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent stream, debug_synchronous); } @@ -568,6 +576,8 @@ struct DeviceReduce // The output value type using OutputValueT = typename OutputTupleT::Value; + using InitT = OutputTupleT; + // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; @@ -575,9 +585,9 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + InitT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - return DispatchReduce::Dispatch( + return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index 0ac3b7b3c0..9b0148665b 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -1,7 +1,6 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -14,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -28,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. + * @file cub::DeviceReduce provides device-wide, parallel operations for + * computing a reduction across a sequence of data items residing within + * device-accessible memory. */ #pragma once @@ -37,14 +37,13 @@ #include #include -#include "../../agent/agent_reduce.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../config.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" +#include +#include +#include +#include +#include +#include +#include #include @@ -55,788 +54,1247 @@ CUB_NAMESPACE_BEGIN *****************************************************************************/ /** - * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. + * @brief Reduce region kernel entry point (multi-block). Computes privatized + * reductions, one per thread block. 
+ * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @tparam AccumT + * Accumulator type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] even_share + * Even-share descriptor for mapping an equal number of tiles onto each + * thread block + * + * @param[in] reduction_op + * Binary reduction functor */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) -__global__ void DeviceReduceKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input data items - GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block - ReductionOpT reduction_op) ///< [in] Binary reduction functor +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceReduceKernel(InputIteratorT d_in, + AccumT* d_out, + OffsetT num_items, + GridEvenShare even_share, + ReductionOpT reduction_op) { - // The output value type - using OutputT = - cub::detail::non_void_value_t>; - - // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); - - // Output result - if (threadIdx.x == 0) - d_out[blockIdx.x] = block_aggregate; + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + AccumT block_aggregate = + AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + { + new (d_out + blockIdx.x) AccumT(block_aggregate); + } } - /** - * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. + * @brief Reduce a single tile kernel entry point (single-block). Can be used + * to aggregate privatized thread block reductions from a previous + * multi-block reduction pass. 
+ * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OutputIteratorT + * Output iterator type for recording the reduced aggregate \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `T operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @tparam AccumT + * Accumulator type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -__global__ void DeviceReduceSingleTileKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input data items - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init) ///< [in] The initial value of the reduction +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init) { - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Check if empty problem - if (num_items == 0) + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) { - if (threadIdx.x == 0) - *d_out = init; - return; + *d_out = init; } - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - OffsetT(0), - num_items); + return; + } - // Output result - if (threadIdx.x == 0) - *d_out = reduction_op(init, block_aggregate); -} + // Consume input tiles + AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op) + .ConsumeRange(OffsetT(0), num_items); + // Output result + if (threadIdx.x == 0) + { + *d_out = reduction_op(init, block_aggregate); + } +} /// Normalize input iterator to segment offset template -__device__ __forceinline__ -void NormalizeReductionOutput( - T &/*val*/, - OffsetT /*base_offset*/, - IteratorT /*itr*/) +__device__ __forceinline__ void NormalizeReductionOutput(T 
& /*val*/, + OffsetT /*base_offset*/, + IteratorT /*itr*/) {} - /// Normalize input iterator to segment offset (specialized for arg-index) -template -__device__ __forceinline__ -void NormalizeReductionOutput( - KeyValuePairT &val, - OffsetT base_offset, - ArgIndexInputIterator /*itr*/) +template +__device__ __forceinline__ void NormalizeReductionOutput( + KeyValuePairT &val, + OffsetT base_offset, + ArgIndexInputIterator /*itr*/) { - val.key -= base_offset; + val.key -= base_offset; } - /** * Segmented reduction (one block per segment) + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OutputIteratorT + * Output iterator type for recording the reduced aggregate \iterator + * + * @tparam BeginOffsetIteratorT + * Random-access input iterator type for reading segment beginning offsets + * \iterator + * + * @tparam EndOffsetIteratorT + * Random-access input iterator type for reading segment ending offsets + * \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `T operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first element + * of the *i*th data segment in `d_keys_*` and `d_values_*` + * + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * considered empty. 
+ * + * @param[in] num_segments + * The number of segments that comprise the sorting data + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator - typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceSegmentedReduceKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init) ///< [in] The initial value of the reduction + InputIteratorT d_in, + OutputIteratorT d_out, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int /*num_segments*/, + ReductionOpT reduction_op, + InitT init) { - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::ReducePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - OffsetT segment_begin = d_begin_offsets[blockIdx.x]; - OffsetT segment_end = d_end_offsets[blockIdx.x]; - - // Check if empty problem - if (segment_begin == segment_end) + // Thread block type for reducing input tiles + using AgentReduceT = + AgentReduce; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + + // Check if empty problem + if (segment_begin == segment_end) + { + if (threadIdx.x == 0) { - if (threadIdx.x == 0) - d_out[blockIdx.x] = init; - return; + d_out[blockIdx.x] = init; } + return; + } - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - segment_begin, - segment_end); + // Consume input tiles + AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op) + .ConsumeRange(segment_begin, segment_end); - // Normalize as needed - NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); - if (threadIdx.x == 0) - d_out[blockIdx.x] = reduction_op(init, block_aggregate);; + if (threadIdx.x == 0) + { + d_out[blockIdx.x] = reduction_op(init, block_aggregate); + } } - - - /****************************************************************************** * Policy ******************************************************************************/ +/** + * @tparam AccumT + * Accumulator data type + * + * OffsetT + * Signed integer type for global offsets + * + * ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + */ template < - typename InputT, ///< Input data type - typename OutputT, ///< Compute/output data type - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename AccumT, + typename OffsetT, + typename ReductionOpT> struct DeviceReducePolicy { - //------------------------------------------------------------------------------ - // Architecture-specific tuning policies - //------------------------------------------------------------------------------ - - /// SM30 - struct Policy300 : ChainedPolicy<300, Policy300, Policy300> - { - // ReducePolicy (GTX670: 154.0 @ 48M 4B items) - typedef AgentReducePolicy< - 256, 20, InputT, ///< Threads per block, items per thread, compute type, compute type - 2, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy 
SegmentedReducePolicy; - }; - - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> - { - // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) - typedef AgentReducePolicy< - 256, 20, InputT, ///< Threads per block, items per thread, compute type - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> - { - // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) - typedef AgentReducePolicy< - 256, 16, InputT, ///< Threads per block, items per thread, compute type - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// MaxPolicy - typedef Policy600 MaxPolicy; - + //--------------------------------------------------------------------------- + // Architecture-specific tuning policies + //--------------------------------------------------------------------------- + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy300> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 20; + static constexpr int items_per_vec_load = 2; + + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + using ReducePolicy = AgentReducePolicy; + + // SingleTilePolicy + using SingleTilePolicy = ReducePolicy; + + // SegmentedReducePolicy + using SegmentedReducePolicy = ReducePolicy; + }; + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 20; + static constexpr int items_per_vec_load = 4; + + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B + // items) + using ReducePolicy = AgentReducePolicy; + + // SingleTilePolicy + using SingleTilePolicy = ReducePolicy; + + // SegmentedReducePolicy + using SegmentedReducePolicy = ReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 16; + static constexpr int items_per_vec_load = 4; + + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + using ReducePolicy = AgentReducePolicy; + + // SingleTilePolicy + using SingleTilePolicy = ReducePolicy; + + // SegmentedReducePolicy + using SegmentedReducePolicy = ReducePolicy; + }; + + using MaxPolicy = Policy600; }; /****************************************************************************** * Single-problem dispatch - ******************************************************************************/ + *****************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + * @brief Utility class for dispatching the appropriately-tuned kernels for + * device-wide reduction + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OutputIteratorT 
+ * Output iterator type for recording the reduced aggregate \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + * + * @tparam InitT + * Initial value type */ template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT = ///< Data type of the output iterator - cub::detail::non_void_value_t< - OutputIteratorT, - cub::detail::value_t>, - typename SelectedPolicy = DeviceReducePolicy< - cub::detail::value_t, - OutputT, - OffsetT, - ReductionOpT> > -struct DispatchReduce : - SelectedPolicy + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetT, + typename ReductionOpT, + typename InitT = + cub::detail::non_void_value_t< + OutputIteratorT, + cub::detail::value_t>, + typename AccumT = + detail::accumulator_t< + ReductionOpT, + InitT, + cub::detail::value_t>, + typename SelectedPolicy = DeviceReducePolicy> +struct DispatchReduce : SelectedPolicy { - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out; ///< [out] Pointer to the output aggregate - OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int ptx_version; ///< [in] PTX version - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchReduce( - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - OutputT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_in(d_in), - d_out(d_out), - num_items(num_items), - reduction_op(reduction_op), - init(init), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - //------------------------------------------------------------------------------ - // Small-problem (single tile) invocation - //------------------------------------------------------------------------------ - - /// Invoke a single block block to reduce in-core - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokeSingleTile( - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel - { + //--------------------------------------------------------------------------- + // Problem state + //--------------------------------------------------------------------------- + + /// Device-accessible allocation of temporary storage. When `nullptr`, the + /// required allocation size is written to `temp_storage_bytes` and no work + /// is done. + void *d_temp_storage; + + /// Reference to size in bytes of `d_temp_storage` allocation + size_t &temp_storage_bytes; + + /// Pointer to the input sequence of data items + InputIteratorT d_in; + + /// Pointer to the output aggregate + OutputIteratorT d_out; + + /// Total number of input items (i.e., length of `d_in`) + OffsetT num_items; + + /// Binary reduction functor + ReductionOpT reduction_op; + + /// The initial value of the reduction + InitT init; + + /// CUDA stream to launch kernels within. Default is stream0. + cudaStream_t stream; + + /// Whether or not to synchronize the stream after every kernel launch to + /// check for errors. Also causes launch configurations to be printed to the + /// console. Default is `false`. 
+ bool debug_synchronous; + + int ptx_version; + + //--------------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------------- + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ DispatchReduce(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_in(d_in) + , d_out(d_out) + , num_items(num_items) + , reduction_op(reduction_op) + , init(init) + , stream(stream) + , debug_synchronous(debug_synchronous) + , ptx_version(ptx_version) + {} + + //--------------------------------------------------------------------------- + // Small-problem (single tile) invocation + //--------------------------------------------------------------------------- + + /** + * @brief Invoke a single block block to reduce in-core + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam SingleTileKernelT + * Function type of cub::DeviceReduceSingleTileKernel + * + * @param[in] single_tile_kernel + * Kernel function pointer to parameterization of + * cub::DeviceReduceSingleTileKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokeSingleTile(SingleTileKernelT single_tile_kernel) + { #ifndef CUB_RUNTIME_ENABLED - (void)single_tile_kernel; + (void)single_tile_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); #else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - break; - } - - // Log single_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage + // allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " + "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, - (long long) stream, + (long long)stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + } - // Invoke single_reduce_sweep_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream - ).doit(single_tile_kernel, - d_in, - d_out, - num_items, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); + // Invoke single_reduce_sweep_kernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream) + .doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init); - return error; + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } -#endif // CUB_RUNTIME_ENABLED - } + // Sync 
the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) + { + break; + } + } while (0); + return error; - //------------------------------------------------------------------------------ - // Normal problem size invocation (two-pass) - //------------------------------------------------------------------------------ - - /// Invoke two-passes to reduce - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel - typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel - { +#endif // CUB_RUNTIME_ENABLED + } + + //--------------------------------------------------------------------------- + // Normal problem size invocation (two-pass) + //--------------------------------------------------------------------------- + + /** + * @brief Invoke two-passes to reduce + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam ReduceKernelT + * Function type of cub::DeviceReduceKernel + * + * @tparam SingleTileKernelT + * Function type of cub::DeviceReduceSingleTileKernel + * + * @param[in] reduce_kernel + * Kernel function pointer to parameterization of cub::DeviceReduceKernel + * + * @param[in] single_tile_kernel + * Kernel function pointer to parameterization of + * cub::DeviceReduceSingleTileKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokePasses(ReduceKernelT reduce_kernel, + SingleTileKernelT single_tile_kernel) + { #ifndef CUB_RUNTIME_ENABLED - (void) reduce_kernel; - (void) single_tile_kernel; + (void)reduce_kernel; + (void)single_tile_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); #else - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Init regular kernel configuration - KernelConfig reduce_config; - if (CubDebug(error = reduce_config.Init(reduce_kernel))) break; - int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; - - // Even-share work distribution - int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); - GridEvenShare even_share; - even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); - - // Temporary storage allocation requirements - void* allocations[1] = {}; - size_t allocation_sizes[1] = - { - max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions - }; - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - return cudaSuccess; - } - - // Alias the allocation for the privatized 
per-block reductions - OutputT *d_block_reductions = (OutputT*) allocations[0]; - - // Get grid size for device_reduce_sweep_kernel - int reduce_grid_size = even_share.grid_size; - - // Log device_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) + break; + + // Get SM count + int sm_count; + if (CubDebug( + error = cudaDeviceGetAttribute(&sm_count, + cudaDevAttrMultiProcessorCount, + device_ordinal))) + { + break; + } + + // Init regular kernel configuration + KernelConfig reduce_config; + if (CubDebug( + error = reduce_config.Init( + reduce_kernel))) + { + break; + } + + int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); + GridEvenShare even_share; + even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); + + // Temporary storage allocation requirements + void *allocations[1] = {}; + size_t allocation_sizes[1] = { + max_blocks * sizeof(AccumT) // bytes needed for privatized block + // reductions + }; + + // Alias the temporary allocations from the single storage blob (or + // compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, + temp_storage_bytes, + allocations, + allocation_sizes))) + { + break; + } + + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage + // allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + AccumT *d_block_reductions = (AccumT *)allocations[0]; + + // Get grid size for device_reduce_sweep_kernel + int reduce_grid_size = even_share.grid_size; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items " + "per thread, %d SM occupancy\n", reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, - (long long) stream, + (long long)stream, ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, reduce_config.sm_occupancy); + } + + // Invoke DeviceReduceKernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + reduce_grid_size, + ActivePolicyT::ReducePolicy::BLOCK_THREADS, + 0, + stream) + .doit(reduce_kernel, + d_in, + d_block_reductions, + num_items, + even_share, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; + } + } - // Invoke DeviceReduceKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, - 0, stream - ).doit(reduce_kernel, - d_in, - d_block_reductions, - num_items, - even_share, - reduction_op); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Log single_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + // Log single_reduce_sweep_kernel configuration + if 
(debug_synchronous) + { + _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " + "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, - (long long) stream, + (long long)stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); - - // Invoke DeviceReduceSingleTileKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream - ).doit(single_tile_kernel, - d_block_reductions, - d_out, - reduce_grid_size, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Invoke DeviceReduceSingleTileKernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + 1, + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + 0, + stream) + .doit(single_tile_kernel, + d_block_reductions, + d_out, + reduce_grid_size, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; } - while (0); + } + } while (0); - return error; + return error; #endif // CUB_RUNTIME_ENABLED - - } - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() + } + + //--------------------------------------------------------------------------- + // Chained policy invocation + //--------------------------------------------------------------------------- + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() + { + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * + SingleTilePolicyT::ITEMS_PER_THREAD)) { - typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; - typedef typename DispatchReduce::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) - { - // Small, single tile size - return InvokeSingleTile( - DeviceReduceSingleTileKernel); - } - else - { - // Regular size - return InvokePasses( - DeviceReduceKernel, - DeviceReduceSingleTileKernel); - } + // Small, single tile size + return InvokeSingleTile( + DeviceReduceSingleTileKernel); } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init, ///< [in] The initial value of the reduction - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + else { - typedef typename DispatchReduce::MaxPolicy MaxPolicyT; - - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchReduce dispatch( - d_temp_storage, temp_storage_bytes, - d_in, d_out, num_items, reduction_op, init, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - } - while (0); - - return error; + // Regular size + return InvokePasses( + DeviceReduceKernel, + DeviceReduceSingleTileKernel); } + } + + //--------------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------------- + + /** + * @brief Internal dispatch routine for computing a device-wide reduction + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_items + * Total number of input items (i.e., length of `d_in`) + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. Also causes launch configurations to + * be printed to the console. Default is `false`. 
+ */ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous) + { + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version = 0; + if (CubDebug(error = PtxVersion(ptx_version))) + { + break; + } + + // Create dispatch functor + DispatchReduce dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + init, + stream, + debug_synchronous, + ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + { + break; + } + } while (0); + + return error; + } }; /****************************************************************************** * Segmented dispatch - ******************************************************************************/ + *****************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + * @brief Utility class for dispatching the appropriately-tuned kernels for + * device-wide reduction + * + * @tparam InputIteratorT + * Random-access input iterator type for reading input items \iterator + * + * @tparam OutputIteratorT + * Output iterator type for recording the reduced aggregate \iterator + * + * @tparam BeginOffsetIteratorT + * Random-access input iterator type for reading segment beginning offsets + * \iterator + * + * @tparam EndOffsetIteratorT + * Random-access input iterator type for reading segment ending offsets + * \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ReductionOpT + * Binary reduction functor type having member + * `auto operator()(const T &a, const U &b)` + * + * @tparam InitT + * value type */ template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator - typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT = ///< Data type of the output iterator - cub::detail::non_void_value_t>, - typename SelectedPolicy = DeviceReducePolicy< - cub::detail::value_t, - OutputT, - OffsetT, - ReductionOpT> > -struct DispatchSegmentedReduce : - SelectedPolicy + typename InputIteratorT, + typename OutputIteratorT, + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT, + typename OffsetT, + typename ReductionOpT, + typename InitT = + cub::detail::non_void_value_t< + OutputIteratorT, + cub::detail::value_t>, + typename AccumT = + detail::accumulator_t< + ReductionOpT, + InitT, + cub::detail::value_t>, + typename SelectedPolicy = DeviceReducePolicy> +struct DispatchSegmentedReduce : SelectedPolicy { - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< 
[in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out; ///< [out] Pointer to the output aggregate - OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets; ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets; ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - int ptx_version; ///< [in] PTX version - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedReduce( - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - OutputT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_in(d_in), - d_out(d_out), - num_segments(num_segments), - d_begin_offsets(d_begin_offsets), - d_end_offsets(d_end_offsets), - reduction_op(reduction_op), - init(init), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel - { + //--------------------------------------------------------------------------- + // Problem state + //--------------------------------------------------------------------------- + + /// Device-accessible allocation of temporary storage. When `nullptr`, the + /// required allocation size is written to `temp_storage_bytes` and no work + /// is done. 
+ void *d_temp_storage; + + /// Reference to size in bytes of `d_temp_storage` allocation + size_t &temp_storage_bytes; + + /// Pointer to the input sequence of data items + InputIteratorT d_in; + + /// Pointer to the output aggregate + OutputIteratorT d_out; + + /// The number of segments that comprise the sorting data + OffsetT num_segments; + + /// Random-access input iterator to the sequence of beginning offsets of + /// length `num_segments`, such that `d_begin_offsets[i]` is the first + /// element of the *i*th data segment in `d_keys_*` and + /// `d_values_*` + BeginOffsetIteratorT d_begin_offsets; + + /// Random-access input iterator to the sequence of ending offsets of length + /// `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + /// the *i*th data segment in `d_keys_*` and `d_values_*`. + /// If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + /// considered empty. + EndOffsetIteratorT d_end_offsets; + + /// Binary reduction functor + ReductionOpT reduction_op; + + /// The initial value of the reduction + InitT init; + + /// CUDA stream to launch kernels within. Default is stream0. + cudaStream_t stream; + + /// Whether or not to synchronize the stream after every kernel launch to + /// check for errors. Also causes launch configurations to be printed to the + /// console. Default is `false`. + bool debug_synchronous; + + int ptx_version; + + //--------------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------------- + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedReduce(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_in(d_in) + , d_out(d_out) + , num_segments(num_segments) + , d_begin_offsets(d_begin_offsets) + , d_end_offsets(d_end_offsets) + , reduction_op(reduction_op) + , init(init) + , stream(stream) + , debug_synchronous(debug_synchronous) + , ptx_version(ptx_version) + {} + + //--------------------------------------------------------------------------- + // Chained policy invocation + //--------------------------------------------------------------------------- + + /** + * @brief Invocation + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam DeviceSegmentedReduceKernelT + * Function type of cub::DeviceSegmentedReduceKernel + * + * @param[in] segmented_reduce_kernel + * Kernel function pointer to parameterization of + * cub::DeviceSegmentedReduceKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel) + { #ifndef CUB_RUNTIME_ENABLED - (void)segmented_reduce_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); + (void)segmented_reduce_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); #else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - return cudaSuccess; - } - - // Init kernel configuration - KernelConfig 
segmented_reduce_config; - if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; - - // Log device_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + cudaError error = cudaSuccess; + + do + { + // Return if the caller is simply requesting the size of the storage + // allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Init kernel configuration + KernelConfig segmented_reduce_config; + if (CubDebug( + error = segmented_reduce_config + .Init( + segmented_reduce_kernel))) + { + break; + } + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), " + "%d items per thread, %d SM occupancy\n", num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, - (long long) stream, + (long long)stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); - - // Invoke DeviceReduceKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - num_segments, - ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream - ).doit(segmented_reduce_kernel, - d_in, - d_out, - d_begin_offsets, - d_end_offsets, - num_segments, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Invoke DeviceReduceKernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + 0, + stream) + .doit(segmented_reduce_kernel, + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_segments, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) + { + break; + } + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; } - while (0); + } + } while (0); - return error; + return error; #endif // CUB_RUNTIME_ENABLED - - } - - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() + } + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedReduceKernel); + } + + //--------------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------------- + + /** + * @brief Internal dispatch routine for computing a device-wide reduction + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output aggregate + * + * @param[in] num_segments + * The number of segments that comprise the sorting data + * + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and + * `d_values_*` + * + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * considered empty. + * + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] init + * The initial value of the reduction + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. Also causes launch configurations to + * be printed to the console. Default is `false`. + */ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous) + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + if (num_segments <= 0) { - typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - return InvokePasses( - DeviceSegmentedReduceKernel); + return cudaSuccess; } + cudaError error = cudaSuccess; - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init, ///< [in] The initial value of the reduction - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + do { - typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; - - if (num_segments <= 0) - return cudaSuccess; - - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchSegmentedReduce dispatch( - d_temp_storage, temp_storage_bytes, - d_in, d_out, - num_segments, d_begin_offsets, d_end_offsets, - reduction_op, init, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - } - while (0); - - return error; - } + // Get PTX version + int ptx_version = 0; + if (CubDebug(error = PtxVersion(ptx_version))) + { + break; + } + + // Create dispatch functor + DispatchSegmentedReduce dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + reduction_op, + init, + stream, + debug_synchronous, + ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + { + break; + } + } while (0); + + return error; + } }; - CUB_NAMESPACE_END - diff --git a/cub/thread/thread_operators.cuh b/cub/thread/thread_operators.cuh index c220ecacfe..b032763023 100644 --- a/cub/thread/thread_operators.cuh +++ b/cub/thread/thread_operators.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -13,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Simple binary operator functor types */ @@ -37,308 +37,324 @@ #pragma once -#include "../config.cuh" -#include "../util_type.cuh" +#include +#include + +#include +#include CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilModule + * @addtogroup UtilModule * @{ */ -/** - * \brief Default equality functor - */ +/// @brief Default equality functor struct Equality { - /// Boolean equality operator, returns (a == b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const - { - return a == b; - } + /// Boolean equality operator, returns `t == u` + template + __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) const + { + return cuda::std::forward(t) == cuda::std::forward(u); + } }; - -/** - * \brief Default inequality functor - */ +/// @brief Default inequality functor struct Inequality { - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const - { - return a != b; - } + /// Boolean inequality operator, returns `t != u` + template + __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) const + { + return cuda::std::forward(t) != cuda::std::forward(u); + } }; - -/** - * \brief Inequality functor (wraps equality functor) - */ +/// @brief Inequality functor (wraps equality functor) template struct InequalityWrapper { - /// Wrapped equality operator - EqualityOp op; + /// Wrapped equality operator + EqualityOp op; - /// Constructor - __host__ __device__ __forceinline__ - InequalityWrapper(EqualityOp op) : op(op) {} + /// Constructor + __host__ __device__ __forceinline__ InequalityWrapper(EqualityOp op) + : op(op) + {} - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) - { - return !op(a, b); - } + /// Boolean inequality operator, returns `t != u` + template + __host__ __device__ __forceinline__ bool operator()(T &&t, U &&u) + { + return !op(std::forward(t), std::forward(u)); + } }; - -/** - * \brief Default sum functor - */ +/// @brief Default sum functor struct Sum { - /// Binary sum operator, returns a + b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return a + b; - } + /// Binary sum operator, returns `t + u` + template + __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const + -> decltype(cuda::std::forward(t) + cuda::std::forward(u)) + { + return cuda::std::forward(t) + cuda::std::forward(u); + } }; -/** - * \brief Default difference functor - */ +/// @brief Default difference functor struct Difference { - /// Binary difference operator, returns a - b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + /// Binary difference operator, returns `t - u` + template + __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const + -> decltype(cuda::std::forward(t) - cuda::std::forward(u)) { - return a - b; + return cuda::std::forward(t) - cuda::std::forward(u); } }; -/** - * \brief Default division functor - */ +/// @brief Default division 
functor struct Division { - /// Binary difference operator, returns a - b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + /// Binary division operator, returns `t / u` + template + __host__ __device__ __forceinline__ auto operator()(T &&t, U &&u) const + -> decltype(cuda::std::forward(t) / cuda::std::forward(u)) { - return a / b; + return cuda::std::forward(t) / cuda::std::forward(u); } }; - -/** - * \brief Default max functor - */ +/// @brief Default max functor struct Max { - /// Boolean max operator, returns (a > b) ? a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return CUB_MAX(a, b); - } + /// Boolean max operator, returns `(t > u) ? t : u` + template + __host__ __device__ __forceinline__ + typename cuda::std::common_type::type + operator()(T &&t, U &&u) const + { + return CUB_MAX(t, u); + } }; - -/** - * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) - */ +/// @brief Arg max functor (keeps the value and offset of the first occurrence +/// of the larger item) struct ArgMax { - /// Boolean max operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair &a, - const KeyValuePair &b) const - { -// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) -// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + /// Boolean max operator, preferring the item having the smaller offset in + /// case of ties + template + __host__ __device__ __forceinline__ KeyValuePair + operator()(const KeyValuePair &a, + const KeyValuePair &b) const + { + // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) + // return ((b.value > a.value) || + // ((a.value == b.value) && (b.key < a.key))) + // ? b : a; - if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) - return b; - return a; + if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) + { + return b; } -}; + return a; + } +}; -/** - * \brief Default min functor - */ +/// @brief Default min functor struct Min { - /// Boolean min operator, returns (a < b) ? a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return CUB_MIN(a, b); - } + /// Boolean min operator, returns `(t < u) ? t : u` + template + __host__ __device__ __forceinline__ + typename cuda::std::common_type::type + operator()(T &&t, U &&u) const + { + return CUB_MIN(t, u); + } }; - -/** - * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) - */ +/// @brief Arg min functor (keeps the value and offset of the first occurrence +/// of the smallest item) struct ArgMin { - /// Boolean min operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair &a, - const KeyValuePair &b) const - { -// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) -// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? 
b : a; + /// Boolean min operator, preferring the item having the smaller offset in + /// case of ties + template + __host__ __device__ __forceinline__ KeyValuePair + operator()(const KeyValuePair &a, + const KeyValuePair &b) const + { + // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) + // return ((b.value < a.value) || + // ((a.value == b.value) && (b.key < a.key))) + // ? b : a; - if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) - return b; - return a; + if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) + { + return b; } -}; + return a; + } +}; -/** - * \brief Default cast functor - */ +/// @brief Default cast functor template struct CastOp { - /// Cast operator, returns (B) a - template - __host__ __device__ __forceinline__ B operator()(const A &a) const - { - return (B) a; - } + /// Cast operator, returns `(B) a` + template + __host__ __device__ __forceinline__ B operator()(A &&a) const + { + return (B)a; + } }; - -/** - * \brief Binary operator wrapper for switching non-commutative scan arguments - */ +/// @brief Binary operator wrapper for switching non-commutative scan arguments template class SwizzleScanOp { private: - - /// Wrapped scan operator - ScanOp scan_op; + /// Wrapped scan operator + ScanOp scan_op; public: + /// Constructor + __host__ __device__ __forceinline__ SwizzleScanOp(ScanOp scan_op) + : scan_op(scan_op) + {} - /// Constructor - __host__ __device__ __forceinline__ - SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} - - /// Switch the scan arguments - template - __host__ __device__ __forceinline__ - T operator()(const T &a, const T &b) - { - T _a(a); - T _b(b); + /// Switch the scan arguments + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) + { + T _a(a); + T _b(b); - return scan_op(_b, _a); - } + return scan_op(_b, _a); + } }; - /** - * \brief Reduce-by-segment functor. + * @brief Reduce-by-segment functor. * - * Given two cub::KeyValuePair inputs \p a and \p b and a - * binary associative combining operator \p f(const T &x, const T &y), - * an instance of this functor returns a cub::KeyValuePair whose \p key - * field is a.key + b.key, and whose \p value field - * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. + * Given two cub::KeyValuePair inputs `a` and `b` and a binary associative + * combining operator `f(const T &x, const T &y)`, an instance of this functor + * returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose + * `value` field is either `b.value` if `b.key` is non-zero, or + * `f(a.value, b.value)` otherwise. * - * ReduceBySegmentOp is an associative, non-commutative binary combining operator - * for input sequences of cub::KeyValuePair pairings. Such - * sequences are typically used to represent a segmented set of values to be reduced + * ReduceBySegmentOp is an associative, non-commutative binary combining + * operator for input sequences of cub::KeyValuePair pairings. Such sequences + * are typically used to represent a segmented set of values to be reduced * and a corresponding set of {0,1}-valued integer "head flags" demarcating the * first value of each segment. 
* + * @tparam ReductionOpT Binary reduction operator to apply to values */ -template ///< Binary reduction operator to apply to values +template struct ReduceBySegmentOp { - /// Wrapped reduction operator - ReductionOpT op; + /// Wrapped reduction operator + ReductionOpT op; - /// Constructor - __host__ __device__ __forceinline__ ReduceBySegmentOp() {} + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp() {} - /// Constructor - __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) + : op(op) + {} - /// Scan operator - template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) - __host__ __device__ __forceinline__ KeyValuePairT operator()( - const KeyValuePairT &first, ///< First partial reduction - const KeyValuePairT &second) ///< Second partial reduction - { - KeyValuePairT retval; - retval.key = first.key + second.key; + /** + * @brief Scan operator + * + * @tparam KeyValuePairT + * KeyValuePair pairing of T (value) and OffsetT (head flag) + * + * @param[in] first + * First partial reduction + * + * @param[in] second + * Second partial reduction + */ + template + __host__ __device__ __forceinline__ KeyValuePairT + operator()(const KeyValuePairT &first, const KeyValuePairT &second) + { + KeyValuePairT retval; + retval.key = first.key + second.key; #ifdef _NVHPC_CUDA // WAR bug on nvc++ - if (second.key) - { - retval.value = second.value; - } - else - { - // If second.value isn't copied into a temporary here, nvc++ will - // crash while compiling the TestScanByKeyWithLargeTypes test in - // thrust/testing/scan_by_key.cu: - auto v2 = second.value; - retval.value = op(first.value, v2); - } + if (second.key) + { + retval.value = second.value; + } + else + { + // If second.value isn't copied into a temporary here, nvc++ will + // crash while compiling the TestScanByKeyWithLargeTypes test in + // thrust/testing/scan_by_key.cu: + auto v2 = second.value; + retval.value = op(first.value, v2); + } #else // not nvc++: - retval.value = (second.key) ? - second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate - op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate + // if (second.key) { + // The second partial reduction spans a segment reset, so it's value + // aggregate becomes the running aggregate + // else { + // The second partial reduction does not span a reset, so accumulate both + // into the running aggregate + // } + retval.value = (second.key) ? 
second.value : op(first.value, second.value); #endif - return retval; - } + return retval; + } }; - -template ///< Binary reduction operator to apply to values +/** + * @tparam ReductionOpT Binary reduction operator to apply to values + */ +template struct ReduceByKeyOp { - /// Wrapped reduction operator - ReductionOpT op; - - /// Constructor - __host__ __device__ __forceinline__ ReduceByKeyOp() {} + /// Wrapped reduction operator + ReductionOpT op; - /// Constructor - __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp() {} - /// Scan operator - template - __host__ __device__ __forceinline__ KeyValuePairT operator()( - const KeyValuePairT &first, ///< First partial reduction - const KeyValuePairT &second) ///< Second partial reduction - { - KeyValuePairT retval = second; + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) + : op(op) + {} - if (first.key == second.key) - retval.value = op(first.value, retval.value); + /** + * @brief Scan operator + * + * @param[in] first First partial reduction + * @param[in] second Second partial reduction + */ + template + __host__ __device__ __forceinline__ KeyValuePairT + operator()(const KeyValuePairT &first, const KeyValuePairT &second) + { + KeyValuePairT retval = second; - return retval; + if (first.key == second.key) + { + retval.value = op(first.value, retval.value); } -}; + return retval; + } +}; template struct BinaryFlip diff --git a/cub/thread/thread_reduce.cuh b/cub/thread/thread_reduce.cuh index 98fb2faab3..9e4fb79d45 100644 --- a/cub/thread/thread_reduce.cuh +++ b/cub/thread/thread_reduce.cuh @@ -34,6 +34,7 @@ #pragma once #include "../thread/thread_operators.cuh" +#include "../detail/type_traits.cuh" #include "../config.cuh" CUB_NAMESPACE_BEGIN @@ -47,14 +48,16 @@ namespace internal { template < int LENGTH, typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( + typename ReductionOp, + typename PrefixT, + typename AccumT = detail::accumulator_t> +__device__ __forceinline__ AccumT ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix, ///< [in] Prefix to seed reduction with + PrefixT prefix, ///< [in] Prefix to seed reduction with Int2Type /*length*/) { - T retval = prefix; + AccumT retval = prefix; #pragma unroll for (int i = 0; i < LENGTH; ++i) @@ -74,11 +77,13 @@ __device__ __forceinline__ T ThreadReduce( template < int LENGTH, typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( + typename ReductionOp, + typename PrefixT, + typename AccumT = detail::accumulator_t> +__device__ __forceinline__ AccumT ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with + PrefixT prefix) ///< [in] Prefix to seed reduction with { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } @@ -114,11 +119,13 @@ __device__ __forceinline__ T ThreadReduce( template < int LENGTH, typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( + typename ReductionOp, + typename PrefixT, + typename AccumT = detail::accumulator_t> +__device__ __forceinline__ AccumT ThreadReduce( T (&input)[LENGTH], ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with + PrefixT prefix) ///< [in] Prefix to seed reduction with { 
return ThreadReduce(input, reduction_op, prefix, Int2Type()); } diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index 0599add52c..a8158b35a8 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -78,8 +78,9 @@ enum Backend struct CustomMax { /// Boolean max operator, returns (a > b) ? a : b - template - __host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b) + template + __host__ __device__ auto operator()(T&& a, C&& b) + -> cub::detail::accumulator_t { return CUB_MAX(a, b); } @@ -636,7 +637,9 @@ void Initialize( template struct Solution { - typedef _OutputT OutputT; + using OutputT = _OutputT; + using InitT = OutputT; + using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, @@ -644,7 +647,7 @@ struct Solution { for (int i = 0; i < num_segments; ++i) { - OutputT aggregate = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent + AccumT aggregate = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, OutputT(h_in[j])); h_reference[i] = aggregate; @@ -656,7 +659,9 @@ struct Solution template struct Solution { - typedef _OutputT OutputT; + using OutputT = _OutputT; + using InitT = OutputT; + using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, @@ -664,7 +669,7 @@ struct Solution { for (int i = 0; i < num_segments; ++i) { - OutputT aggregate = Traits::Max(); // replace with std::numeric_limits::max() when C++ support is more prevalent + AccumT aggregate = Traits::Max(); // replace with std::numeric_limits::max() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, OutputT(h_in[j])); h_reference[i] = aggregate; @@ -677,7 +682,9 @@ struct Solution template struct Solution { - typedef _OutputT OutputT; + using OutputT = _OutputT; + using InitT = OutputT; + using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, @@ -685,10 +692,10 @@ struct Solution { for (int i = 0; i < num_segments; ++i) { - OutputT aggregate; + AccumT aggregate; InitValue(INTEGER_SEED, aggregate, 0); for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) - aggregate = reduction_op(aggregate, OutputT(h_in[j])); + aggregate = reduction_op(aggregate, h_in[j]); h_reference[i] = aggregate; } } @@ -795,6 +802,7 @@ void Test( // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose); + printf("\t%s", compare ? 
"FAIL" : "PASS"); // Flush any stdout/stderr @@ -1182,13 +1190,173 @@ void TestType( OffsetT max_items, OffsetT max_segments) { - typedef typename DeviceReducePolicy::MaxPolicy MaxPolicyT; + typedef typename DeviceReducePolicy::MaxPolicy MaxPolicyT; TestBySize dispatch(max_items, max_segments); MaxPolicyT::Invoke(g_ptx_version, dispatch); } +class CustomInputT +{ + char m_val{}; + +public: + __host__ __device__ explicit CustomInputT(char val) + : m_val(val) + {} + + __host__ __device__ int get() const { return static_cast(m_val); } +}; + +class CustomAccumulatorT +{ + int m_val{0}; + int m_magic_value{42}; + + __host__ __device__ CustomAccumulatorT(int val) + : m_val(val) + {} + +public: + __host__ __device__ CustomAccumulatorT() + {} + + __host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in) + : m_val(in.is_valid() * in.get()) + , m_magic_value(in.is_valid() * 42) + {} + + __host__ __device__ void operator=(const CustomInputT &in) + { + if (this->is_valid()) + { + m_val = in.get(); + } + } + + __host__ __device__ void operator=(const CustomAccumulatorT &in) + { + if (this->is_valid() && in.is_valid()) + { + m_val = in.get(); + } + } + + __host__ __device__ CustomAccumulatorT + operator+(const CustomInputT &in) const + { + const int multiplier = this->is_valid(); + return {(m_val + in.get()) * multiplier}; + } + + __host__ __device__ CustomAccumulatorT + operator+(const CustomAccumulatorT &in) const + { + const int multiplier = this->is_valid() && in.is_valid(); + return {(m_val + in.get()) * multiplier}; + } + + __host__ __device__ int get() const { return m_val; } + + __host__ __device__ bool is_valid() const { return m_magic_value == 42; } +}; + +class CustomOutputT +{ + bool *m_d_flag{}; + int m_expected{}; + +public: + __host__ __device__ CustomOutputT(bool *d_flag, int expected) + : m_d_flag(d_flag) + , m_expected(expected) + {} + + __host__ __device__ void operator=(const CustomAccumulatorT &accum) const + { + *m_d_flag = accum.is_valid() && (accum.get() == m_expected); + } +}; + +__global__ void InitializeTestAccumulatorTypes(int num_items, + int expected, + bool *d_flag, + CustomInputT *d_in, + CustomOutputT *d_out) +{ + const int idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + + if (idx < num_items) + { + d_in[idx] = CustomInputT(1); + } + + if (idx == 0) + { + *d_out = CustomOutputT{d_flag, expected}; + } +} + +void TestAccumulatorTypes() +{ + const int num_items = 2 * 1024 * 1024; + const int expected = num_items; + const int block_size = 256; + const int grid_size = (num_items + block_size - 1) / block_size; + + CustomInputT *d_in{}; + CustomOutputT *d_out{}; + CustomAccumulatorT init{}; + bool *d_flag{}; + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_out, sizeof(CustomOutputT))); + CubDebugExit(g_allocator.DeviceAllocate((void **)&d_flag, sizeof(bool))); + CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, + sizeof(CustomInputT) * num_items)); + + InitializeTestAccumulatorTypes<<>>(num_items, + expected, + d_flag, + d_in, + d_out); + + std::uint8_t *d_temp_storage{}; + std::size_t temp_storage_bytes{}; + + CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum{}, + init, + 0, + true)); + + CubDebugExit( + g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); + CubDebugExit(cudaMemset(d_temp_storage, 1, temp_storage_bytes)); + + CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum{}, + 
init, + 0, + true)); + + bool ok{}; + CubDebugExit(cudaMemcpy(&ok, d_flag, sizeof(bool), cudaMemcpyDeviceToHost)); + + AssertTrue(ok); + + CubDebugExit(g_allocator.DeviceFree(d_out)); + CubDebugExit(g_allocator.DeviceFree(d_in)); +} //--------------------------------------------------------------------- // Main @@ -1256,6 +1424,8 @@ int main(int argc, char** argv) #else // TEST_TYPES == 3 TestType(max_items, max_segments); TestType(max_items, max_segments); + + TestAccumulatorTypes(); #endif printf("\n"); return 0; diff --git a/test/test_thread_operators.cu b/test/test_thread_operators.cu new file mode 100644 index 0000000000..44cc3a81d6 --- /dev/null +++ b/test/test_thread_operators.cu @@ -0,0 +1,259 @@ +/******************************************************************************* + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+#include "test_util.h"
+
+#include <cub/thread/thread_operators.cuh>
+
+template <class T>
+T Make(int val)
+{
+  return T{val};
+}
+
+template <bool>
+class BaseT
+{
+protected:
+  int m_val{};
+
+public:
+  BaseT(int val)
+      : m_val{val}
+  {}
+};
+
+template <>
+class BaseT<true>
+{
+protected:
+  int m_val{};
+
+public:
+  BaseT(int val)
+      : m_val{val}
+  {}
+
+  __host__ __device__ operator int() const { return m_val; }
+};
+
+#define CUSTOM_TYPE_FACTORY(NAME, RT, OP, CONVERTABLE)                        \
+  class Custom##NAME##T : public BaseT<CONVERTABLE>                           \
+  {                                                                           \
+    explicit Custom##NAME##T(int val)                                         \
+        : BaseT<CONVERTABLE>(val)                                             \
+    {}                                                                        \
+                                                                              \
+    friend Custom##NAME##T Make<Custom##NAME##T>(int);                        \
+                                                                              \
+  public:                                                                     \
+    __host__ __device__ RT operator OP(int val) const                         \
+    {                                                                         \
+      return m_val OP val;                                                    \
+    }                                                                         \
+  }
+
+//                  NAME  RT    OP  CONVERTABLE
+CUSTOM_TYPE_FACTORY(Eq,   bool, ==, false);
+CUSTOM_TYPE_FACTORY(Ineq, bool, !=, false);
+CUSTOM_TYPE_FACTORY(Sum,  int,  +,  false);
+CUSTOM_TYPE_FACTORY(Diff, int,  -,  false);
+CUSTOM_TYPE_FACTORY(Div,  int,  /,  false);
+CUSTOM_TYPE_FACTORY(Gt,   bool, >,  true);
+CUSTOM_TYPE_FACTORY(Lt,   bool, <,  true);
+
+void TestEquality()
+{
+  cub::Equality op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), true);
+  AssertEquals(op(const_magic_val, magic_val), true);
+  AssertEquals(op(const_magic_val, magic_val + 1), false);
+
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val), true);
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val + 1), false);
+}
+
+void TestInequality()
+{
+  cub::Inequality op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val + 1), true);
+
+  AssertEquals(op(Make<CustomIneqT>(magic_val), magic_val), false);
+  AssertEquals(op(Make<CustomIneqT>(magic_val), magic_val + 1), true);
+}
+
+void TestInequalityWrapper()
+{
+  cub::Equality wrapped_op{};
+  cub::InequalityWrapper<cub::Equality> op{wrapped_op};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val + 1), true);
+
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val), false);
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val + 1), true);
+}
+
+#define CUSTOM_SYNC_T(NAME, RT, OP)                                           \
+  struct Custom ## NAME ## Sink                                               \
+  {                                                                           \
+    template <class T>                                                        \
+    __host__ __device__ RT operator OP (T &&) const                           \
+    {                                                                         \
+      return RT{};                                                            \
+    }                                                                         \
+  }
+
+CUSTOM_SYNC_T(SumInt, int, +);
+CUSTOM_SYNC_T(SumCustomInt, CustomSumIntSink, +);
+
+CUSTOM_SYNC_T(DiffInt, int, -);
+CUSTOM_SYNC_T(DiffCustomInt, CustomDiffIntSink, -);
+
+CUSTOM_SYNC_T(DivInt, int, /);
+CUSTOM_SYNC_T(DivCustomInt, CustomDivIntSink, /);
+
+template <class T, class ExpectedT>
+void StaticSame()
+{
+  static_assert(std::is_same<T, ExpectedT>::value, "shall match");
+}
+
+void TestSum()
+{
+  cub::Sum op{};
+
+  const int const_magic_val = 40;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+  AssertEquals(op(Make<CustomSumT>(magic_val), 2), 42);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(Make<CustomSumT>(magic_val), 2)), int>();
+  StaticSame<decltype(op(CustomSumIntSink{}, CustomSumCustomIntSink{})), int>();
+}
+
+void TestDifference()
+{
+  cub::Difference op{};
+
+  const int const_magic_val = 44;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+
+  AssertEquals(op(Make<CustomDiffT>(magic_val), 2), 42);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(Make<CustomDiffT>(magic_val), 2)), int>();
+  StaticSame<decltype(op(CustomDiffIntSink{}, CustomDiffCustomIntSink{})), int>();
+}
+
+void TestDivision()
+{
+  cub::Division op{};
+
+  const int const_magic_val = 44;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 22);
+  AssertEquals(op(magic_val, 2), 22);
+
+  AssertEquals(op(Make<CustomDivT>(magic_val), 2), 22);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(Make<CustomDivT>(magic_val), 2)), int>();
+  StaticSame<decltype(op(CustomDivIntSink{}, CustomDivCustomIntSink{})), int>();
+}
+
+void TestMax()
+{
+  cub::Max op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+
+  AssertEquals(op(2, Make<CustomGtT>(magic_val)), 42);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(2, Make<CustomGtT>(magic_val))), int>();
+}
+
+void TestMin()
+{
+  cub::Min op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 2);
+  AssertEquals(op(magic_val, 2), 2);
+
+  AssertEquals(op(2, Make<CustomLtT>(magic_val)), 2);
+
+  StaticSame<decltype(op(const_magic_val, 2)), int>();
+  StaticSame<decltype(op(magic_val, 2)), int>();
+  StaticSame<decltype(op(2, Make<CustomLtT>(magic_val))), int>();
+}
+
+int main()
+{
+  TestEquality();
+  TestInequality();
+  TestInequalityWrapper();
+  TestSum();
+  TestDifference();
+  TestDivision();
+  TestMax();
+  TestMin();
+
+  return 0;
+}
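
Reviewer aside, not part of the patch: with the accumulator-type changes above, the type used for intermediate accumulation in cub::DeviceReduce::Reduce follows decltype(reduction_op(init, *d_in)) rather than the input value type, which is what TestAccumulatorTypes exercises with CustomAccumulatorT. The sketch below is a minimal, hypothetical usage example of that behaviour against the public DeviceReduce::Reduce interface only (float inputs, double init, therefore a double accumulator); names such as h_in/d_temp_storage are illustrative and error handling is intentionally minimal.

// Usage sketch: float inputs + double init => double accumulator (P2322R6).
#include <cub/device/device_reduce.cuh>

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  const int num_items = 1 << 20;
  std::vector<float> h_in(num_items, 1.0f);

  float *d_in{};
  double *d_out{};
  cudaMalloc(&d_in, sizeof(float) * num_items);
  cudaMalloc(&d_out, sizeof(double));
  cudaMemcpy(d_in, h_in.data(), sizeof(float) * num_items,
             cudaMemcpyHostToDevice);

  // The init value is a double, so cub::Sum{}(double, float) yields double and
  // the partial sums are carried in double precision, not float.
  const double init = 0.0;

  // Standard two-phase CUB call: temporary-storage size query, then reduction.
  void *d_temp_storage{};
  std::size_t temp_storage_bytes{};
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                            d_in, d_out, num_items, cub::Sum{}, init);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                            d_in, d_out, num_items, cub::Sum{}, init);

  double h_out{};
  cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost);
  std::printf("sum = %f\n", h_out); // expected: 1048576.000000

  cudaFree(d_temp_storage);
  cudaFree(d_out);
  cudaFree(d_in);
  return 0;
}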