From b00f714a11df33b5f24bc2ad14aaebda71446413 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 17 May 2022 17:47:08 +0400 Subject: [PATCH] In-place guarantees for scan algorithms --- cub/agent/agent_scan_by_key.cuh | 37 +- cub/device/device_scan.cuh | 2765 ++++++++++++------ cub/device/dispatch/dispatch_scan.cuh | 10 +- cub/device/dispatch/dispatch_scan_by_key.cuh | 42 +- test/test_device_scan.cu | 671 +++-- test/test_device_scan_by_key.cu | 247 +- 6 files changed, 2624 insertions(+), 1148 deletions(-) diff --git a/cub/agent/agent_scan_by_key.cuh b/cub/agent/agent_scan_by_key.cuh index 68575a6370..8e25c48af4 100644 --- a/cub/agent/agent_scan_by_key.cuh +++ b/cub/agent/agent_scan_by_key.cuh @@ -158,6 +158,7 @@ struct AgentScanByKey TempStorage & storage; WrappedKeysInputIteratorT d_keys_in; + KeyT* d_keys_prev_in; WrappedValuesInputIteratorT d_values_in; ValuesOutputIteratorT d_values_out; InequalityWrapper inequality_op; @@ -364,19 +365,27 @@ struct AgentScanByKey } else { - KeyT tile_pred_key = (threadIdx.x == 0) ? d_keys_in[tile_base - 1] : KeyT(); - BlockDiscontinuityKeysT(storage.scan_storage.discontinuity) - .FlagHeads(segment_flags, keys, inequality_op, tile_pred_key); - - // Zip values and segment_flags - ZipValuesAndFlags(num_remaining, - values, - segment_flags, - scan_items); - - SizeValuePairT tile_aggregate; - TilePrefixCallbackT prefix_op(tile_state, storage.scan_storage.prefix, pair_scan_op, tile_idx); - ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type()); + KeyT tile_pred_key = (threadIdx.x == 0) ? 
d_keys_prev_in[tile_idx] + : KeyT(); + + BlockDiscontinuityKeysT(storage.scan_storage.discontinuity) + .FlagHeads(segment_flags, keys, inequality_op, tile_pred_key); + + // Zip values and segment_flags + ZipValuesAndFlags(num_remaining, + values, + segment_flags, + scan_items); + + SizeValuePairT tile_aggregate; + TilePrefixCallbackT prefix_op(tile_state, + storage.scan_storage.prefix, + pair_scan_op, + tile_idx); + ScanTile(scan_items, + tile_aggregate, + prefix_op, + Int2Type()); } CTA_SYNC(); @@ -408,6 +417,7 @@ struct AgentScanByKey AgentScanByKey( TempStorage & storage, KeysInputIteratorT d_keys_in, + KeyT * d_keys_prev_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, @@ -416,6 +426,7 @@ struct AgentScanByKey : storage(storage), d_keys_in(d_keys_in), + d_keys_prev_in(d_keys_prev_in), d_values_in(d_values_in), d_values_out(d_values_out), inequality_op(equality_op), diff --git a/cub/device/device_scan.cuh b/cub/device/device_scan.cuh index 5d63d96244..c688515ff7 100644 --- a/cub/device/device_scan.cuh +++ b/cub/device/device_scan.cuh @@ -1,7 +1,6 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -14,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -28,844 +27,1974 @@ ******************************************************************************/ /** - * \file - * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + * @file cub::DeviceScan provides device-wide, parallel operations for + * computing a prefix scan across a sequence of data items residing + * within device-accessible memory. */ #pragma once -#include -#include +#include +#include +#include +#include -#include "../config.cuh" -#include "../thread/thread_operators.cuh" -#include "dispatch/dispatch_scan.cuh" -#include "dispatch/dispatch_scan_by_key.cuh" CUB_NAMESPACE_BEGIN /** - * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) - * \ingroup SingleModule + * @brief DeviceScan provides device-wide, parallel operations for computing a + * prefix scan across a sequence of data items residing within + * device-accessible memory. 
![](device_scan.png) + * + * @ingroup SingleModule * - * \par Overview - * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output sequence where each element is computed to be the reduction - * of the elements occurring earlier in the input sequence. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. + * @par Overview + * Given a sequence of input elements and a binary reduction operator, a + * [*prefix scan*](http://en.wikipedia.org/wiki/Prefix_sum) produces an output + * sequence where each element is computed to be the reduction of the elements + * occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan + * with the addition operator. The term *inclusive* indicates that the + * *i*th output reduction incorporates the *i*th input. + * The term *exclusive* indicates the *i*th input is not + * incorporated into the *i*th output reduction. When the input and + * output sequences are the same, the scan is performed in-place. * - * \par - * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm - * for performing global prefix scan with only a single pass through the - * input data, as described in our 2016 technical report [1]. The central - * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies - * of global prefix propagation with local computation. As such, our algorithm requires only - * ~2n data movement (n inputs are read, n outputs are written), and typically - * proceeds at "memcpy" speeds. Our algorithm supports inplace operations. 
+ * @par + * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our + * *"decoupled look-back"* algorithm for performing global prefix scan with + * only a single pass through the input data, as described in our 2016 technical + * report [1]. The central idea is to leverage a small, constant factor of + * redundant work in order to overlap the latencies of global prefix + * propagation with local computation. As such, our algorithm requires only + * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and + * typically proceeds at "memcpy" speeds. Our algorithm supports inplace + * operations. * - * \par + * @par * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) * - * \par Usage Considerations - * \cdp_class{DeviceScan} + * @par Usage Considerations + * @cdp_class{DeviceScan} * - * \par Performance - * \linear_performance{prefix scan} + * @par Performance + * @linear_performance{prefix scan} * - * \par - * The following chart illustrates DeviceScan::ExclusiveSum - * performance across different CUDA architectures for \p int32 keys. - * \plots_below + * @par + * The following chart illustrates DeviceScan::ExclusiveSum performance across + * different CUDA architectures for `int32` keys. + * @plots_below * - * \image html scan_int32.png + * @image html scan_int32.png * */ struct DeviceScan { - /******************************************************************//** - * \name Exclusive scans - *********************************************************************/ - //@{ - - /** - * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. - * - * \par - * - Supports non-commutative sum operators. 
- * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated exclusive sum performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. - * - * \image html scan_int32.png - * \image html scan_int64.png - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out s<-- [0, 8, 14, 21, 26, 29, 29] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveSum( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Random-access iterator to the input sequence of data items - OutputIteratorT d_out, ///< [out] Random-access iterator to the output sequence of data items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The output value type -- used as the intermediate accumulator - // Use the input value type per https://wg21.link/P0571 - using OutputT = cub::detail::value_t; - - // Initial value - OutputT init_value = 0; - - return DispatchScan, OffsetT>::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - Sum(), - detail::InputValue(init_value), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out. - * - * \par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. 
- * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector - * \par - * \code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... - * - * // Determine temporary device storage requirements for exclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) INT_MAX, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) INT_MAX, num_items); - * - * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam InitValueT [inferred] Type of the \p init_value used Binary scan functor type having member T operator()(const T &a, const T &b) - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT, - typename InitValueT> - 
CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveScan( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Random-access iterator to the input sequence of data items - OutputIteratorT d_out, ///< [out] Random-access iterator to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan, OffsetT>::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream, - debug_synchronous); - } - - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT, - typename InitValueT, - typename InitValueIterT=InitValueT*> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveScan( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - FutureValue init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan, OffsetT>::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream, - debug_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive scans - *********************************************************************/ - //@{ - - - /** - * \brief Computes a device-wide inclusive prefix sum. - * - * \par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [8, 14, 21, 26, 29, 29, 38] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveSum( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Random-access iterator to the input sequence of data items - OutputIteratorT d_out, ///< [out] Random-access iterator to the output sequence of data items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - Sum(), - NullType(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. - * - * \par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. - * \par - * \code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); - * - * // d_out <-- [8, 6, 6, 5, 3, 0, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveScan( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Random-access iterator to the input sequence of data items - OutputIteratorT d_out, ///< [out] Random-access iterator to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - NullType(), - num_items, - stream, - debug_synchronous); - } - - /** - * \brief Computes a device-wide exclusive prefix sum-by-key with key equality - * defined by \p equality_op . The value of 0 is applied as the initial value, - * and is assigned to the beginning of each segment in \p d_values_out . - * - * \par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix sum-by-key of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items); - * - * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator - * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename ValuesOutputIteratorT, - typename EqualityOpT = Equality> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveSumByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items - ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items - ValuesOutputIteratorT d_values_out, ///< [out] Random-access output iterator to the output sequence of value items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_keys_in and \p d_values_in) - EqualityOpT equality_op = EqualityOpT(), ///< [in] Binary functor that defines the equality of keys. Default is cub::Equality(). - cudaStream_t stream=0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous=false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The output value type -- used as the intermediate accumulator - // Use the input value type per https://wg21.link/P0571 - using OutputT = cub::detail::value_t; - - // Initial value - OutputT init_value = 0; - - return DispatchScanByKey< - KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, Sum, OutputT, OffsetT> - ::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - Sum(), - init_value, - num_items, - stream, - debug_synchronous); - } - - /** - * \brief Computes a device-wide exclusive prefix scan-by-key using the specified binary \p scan_op functor. - * The key equality is defined by \p equality_op . The \p init_value value is applied as the initial value, - * and is assigned to the beginning of each segment in \p d_values_out . - * - * \par - * - Supports non-commutative scan operators. 
- * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix min-scan-by-key of an \p int device vector - * \par - * \code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // CustomEqual functor - * struct CustomEqual - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return a == b; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * CustomEqual equality_op; - * ... 
- * - * // Determine temporary device storage requirements for exclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, min_op, (int) INT_MAX, num_items, equality_op); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, min_op, (int) INT_MAX, num_items, equality_op); - * - * // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator - * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam InitValueT [inferred] Type of the \p init_value value used in Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename ValuesOutputIteratorT, - typename ScanOpT, - typename InitValueT, - typename EqualityOpT = Equality> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveScanByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items - ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items - ValuesOutputIteratorT d_values_out, ///< [out] Random-access output iterator to the output sequence of value items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to the beginning of each segment in \p d_values_out) - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_keys_in and \p d_values_in) - EqualityOpT equality_op = EqualityOpT(), ///< [in] Binary functor that defines the equality of keys. Default is cub::Equality(). - cudaStream_t stream=0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous=false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScanByKey< - KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, ScanOpT, InitValueT, OffsetT> - ::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - scan_op, - init_value, - num_items, - stream, - debug_synchronous); - } - - /** - * \brief Computes a device-wide inclusive prefix sum-by-key with key equality defined by \p equality_op . - * - * \par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). 
Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix sum-by-key of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items); - * - * // d_out <-- [8, 14, 7, 12, 15, 0, 9] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator - * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename ValuesOutputIteratorT, - 
typename EqualityOpT = Equality> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveSumByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items - ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items - ValuesOutputIteratorT d_values_out, ///< [out] Random-access output iterator to the output sequence of value items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_keys_in and \p d_values_in) - EqualityOpT equality_op = EqualityOpT(), ///< [in] Binary functor that defines the equality of keys. Default is cub::Equality(). - cudaStream_t stream=0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous=false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScanByKey< - KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, Sum, NullType, OffsetT> - ::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - Sum(), - NullType(), - num_items, - stream, - debug_synchronous); - } - - /** - * \brief Computes a device-wide inclusive prefix scan-by-key using the specified binary \p scan_op functor. - * The key equality is defined by \p equality_op . - * - * \par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). 
Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix min-scan-by-key of an \p int device vector. - * \par - * \code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // CustomEqual functor - * struct CustomEqual - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return a == b; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * CustomEqual equality_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); - * - * // d_out <-- [8, 6, 7, 5, 3, 0, 0] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator - * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename ValuesOutputIteratorT, - typename ScanOpT, - typename EqualityOpT = Equality> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveScanByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items - ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items - ValuesOutputIteratorT d_values_out, ///< [out] Random-access output iterator to the output sequence of value items - ScanOpT scan_op, ///< [in] Binary scan functor - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_keys_in and \p d_values_in) - EqualityOpT equality_op = EqualityOpT(), ///< [in] Binary functor that defines the equality of keys. Default is cub::Equality(). - cudaStream_t stream=0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous=false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScanByKey< - KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, ScanOpT, NullType, OffsetT> - ::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - scan_op, - NullType(), - num_items, - stream, - debug_synchronous); - } - - //@} end member group + /******************************************************************//** + * \name Exclusive scans + *********************************************************************/ + //@{ + + /** + * @brief Computes a device-wide exclusive prefix sum. The value of `0` is + * applied as the initial value, and is assigned to `*d_out`. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). 
Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When the input and output sequences are the same, the scan is + * performed in-place. + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. + * - @devicestorage + * + * @par Performance + * The following charts illustrate saturated exclusive sum performance across + * different CUDA architectures for `int32` and `int64` items, respectively. + * + * @image html scan_int32.png + * @image html scan_int64.png + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix sum of an `int` + * device vector. + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, num_items); + * + * // d_out <-- [0, 8, 14, 21, 26, 29, 29] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. 
When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Random-access iterator to the input sequence of data items + * + * @param[out] d_out + * Random-access iterator to the output sequence of data items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveSum(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + // The output value type -- used as the intermediate accumulator + // Use the input value type per https://wg21.link/P0571 + using OutputT = cub::detail::value_t; + + // Initial value + OutputT init_value = 0; + + return DispatchScan< + InputIteratorT, OutputIteratorT, Sum, detail::InputValue, + OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, + Sum(), detail::InputValue(init_value), + num_items, stream, debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix sum in-place. The value of + * `0` is applied as the initial value, and is assigned to `*d_data`. + * + * @par + * - Supports non-commutative sum operators. 
+ * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When the input and output sequences are the same, the scan is + * performed in-place. + * - @devicestorage + * + * @par Performance + * The following charts illustrate saturated exclusive sum performance across + * different CUDA architectures for `int32` and `int64` items, respectively. + * + * @image html scan_int32.png + * @image html scan_int64.png + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix sum of an `int` + * device vector. + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_data, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_data, num_items); + * + * // d_data <-- [0, 8, 14, 21, 26, 29, 29] + * + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access iterator type for reading scan + * inputs and writing scan outputs + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in,out] d_data + * Random-access iterator to the sequence of data items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveSum(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is applied as + * the initial value, and is assigned to `*d_out`. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. 
+ * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an + * `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, (int) INT_MAX, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, (int) INT_MAX, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type + * having member `T operator()(const T &a, const T &b)` + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. 
When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Random-access iterator to the input sequence of data items + * + * @param[out] d_out + * Random-access iterator to the output sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to *d_out) + * + * @param[in] num_items + * Total number of input items (i.e., the length of \p d_in) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. Default is + * stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int ; + + return DispatchScan, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue( + init_value), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is applied as + * the initial value, and is assigned to `*d_data`. + * + * @par + * - Supports non-commutative scan operators. 
+ * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an + * `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_data, min_op, (int) INT_MAX, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_data, min_op, (int) INT_MAX, num_items); + * + * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs and writing scan outputs + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type + * having member `T operator()(const T &a, const T &b)` + * + * @param[in] d_temp_storage + * Device-accessible 
allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in,out] d_data + * Random-access iterator to the sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to *d_data) + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_data`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. Default is + * stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + ScanOpT scan_op, + InitValueT init_value, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return ExclusiveScan(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is provided as + * a future value. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. 
+ * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an + * `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * int *d_init_iter; // e.g., INT_MAX + * CustomMin min_op; + * + * auto future_init_value = + * cub::FutureValue(d_init_iter); + * + * ... + * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, future_init_value, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, future_init_value, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * 
@tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type + * having member `T operator()(const T &a, const T &b)` + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of \p d_temp_storage allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to `*d_out`) + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + FutureValue init_value, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + return DispatchScan, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue( + init_value), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. 
The `init_value` value is provided as + * a future value. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an + * `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_init_iter; // e.g., INT_MAX + * CustomMin min_op; + * + * auto future_init_value = + * cub::FutureValue(d_init_iter); + * + * ... 
+ * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_data, min_op, future_init_value, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_data, min_op, future_init_value, num_items); + * + * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs and writing scan outputs + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type + * having member `T operator()(const T &a, const T &b)` + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of \p d_temp_storage allocation + * + * @param[in,out] d_data + * Pointer to the sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to `*d_out`) + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + ScanOpT scan_op, + FutureValue init_value, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return ExclusiveScan(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + //@} end member group + /******************************************************************//** + * @name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * @brief Computes a device-wide inclusive prefix sum. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix sum of an `int` + * device vector. + * + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... 
+ * + * // Determine temporary device storage requirements for inclusive + * // prefix sum + * void *d_temp_storage = nullptr; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, num_items); + * + * // d_out <-- [8, 14, 21, 26, 29, 29, 38] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Random-access iterator to the input sequence of data items + * + * @param[out] d_out + * Random-access iterator to the output sequence of data items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveSum(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + return DispatchScan::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix sum in-place. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix sum of an `int` + * device vector. + * + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * ... 
+ * + * // Determine temporary device storage requirements for inclusive + * // prefix sum + * void *d_temp_storage = nullptr; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_data, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_data, num_items); + * + * // d_data <-- [8, 14, 21, 26, 29, 29, 38] + * + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs and writing scan outputs + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in,out] d_data + * Random-access iterator to the sequence of data items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveSum(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return InclusiveSum(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix scan using the specified + * binary `scan_op` functor. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an + * `int` device vector. + * + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... 
+ * + * // Determine temporary device storage requirements for inclusive + * // prefix scan + * void *d_temp_storage = nullptr; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix min-scan + * cub::DeviceScan::InclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, num_items); + * + * // d_out <-- [8, 6, 6, 5, 3, 0, 0] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @param[in] + * d_temp_storage Device-accessible allocation of temporary storage. + * When `nullptr`, the required allocation size is written to + * `temp_storage_bytes` and no work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Random-access iterator to the input sequence of data items + * + * @param[out] d_out + * Random-access iterator to the output sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + return DispatchScan::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix scan using the specified + * binary `scan_op` functor. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an + * `int` device vector. + * + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * CustomMin min_op; + * ... 
+ *
+ * // Determine temporary device storage requirements for inclusive
+ * // prefix scan
+ * void *d_temp_storage = nullptr;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::InclusiveScan(
+ * d_temp_storage, temp_storage_bytes,
+ * d_data, min_op, num_items);
+ *
+ * // Allocate temporary storage for inclusive prefix scan
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run inclusive prefix min-scan
+ * cub::DeviceScan::InclusiveScan(
+ * d_temp_storage, temp_storage_bytes,
+ * d_data, min_op, num_items);
+ *
+ * // d_data <-- [8, 6, 6, 5, 3, 0, 0]
+ *
+ * @endcode
+ *
+ * @tparam IteratorT
+ * **[inferred]** Random-access input iterator type for reading scan
+ * inputs and writing scan outputs
+ *
+ * @tparam ScanOp
+ * **[inferred]** Binary scan functor type having member
+ * `T operator()(const T &a, const T &b)`
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage.
+ * When `nullptr`, the required allocation size is written to
+ * `temp_storage_bytes` and no work is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_data
+ * Random-access iterator to the sequence of data items
+ *
+ * @param[in] scan_op
+ * Binary scan functor
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., the length of `d_data`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. May cause significant slowdown.
+ * Default is `false`.
+ * + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + ScanOpT scan_op, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return InclusiveScan(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + scan_op, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix sum-by-key with key equality + * defined by `equality_op`. The value of `0` is applied as the initial + * value, and is assigned to the beginning of each segment in + * `d_values_out`. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix sum-by-key of an + * `int` device vector. + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_values_out; // e.g., [ , , , , , , ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = nullptr; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSumByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSumByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, d_values_out, num_items); + * + * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] + * + * @endcode + * + * @tparam KeysInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan keys + * inputs \iterator + * + * @tparam ValuesInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * values inputs \iterator + * + * @tparam ValuesOutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * values outputs \iterator + * + * @tparam EqualityOpT + * **[inferred]** Functor type having member + * `T operator()(const T &a, const T &b)` for binary operations that + * defines the equality of keys + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Random-access input iterator to the input sequence of key items + * + * @param[in] d_values_in + * Random-access input iterator to the input sequence of value items + * + * @param[out] d_values_out + * Random-access output iterator to the output sequence of value items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and + * `d_values_in`) + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. 
+ * Default is cub::Equality(). + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveSumByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + int num_items, + EqualityOpT equality_op = EqualityOpT(), + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + // The output value type -- used as the intermediate accumulator + // Use the input value type per https://wg21.link/P0571 + using OutputT = cub::detail::value_t; + + // Initial value + OutputT init_value = 0; + + return DispatchScanByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + Sum(), + init_value, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan-by-key using the + * specified binary `scan_op` functor. The key equality is defined by + * `equality_op`. The `init_value` value is applied as the initial + * value, and is assigned to the beginning of each segment in + * `d_values_out`. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. 
+ * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan-by-key of + * an `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // CustomEqual functor + * struct CustomEqual + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return a == b; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_values_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * CustomEqual equality_op; + * ... 
+ * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScanByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, d_values_out, min_op, + * (int) INT_MAX, num_items, equality_op); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScanByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, d_values_out, min_op, + * (int) INT_MAX, num_items, equality_op); + * + * // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] + * + * @endcode + * + * @tparam KeysInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan keys + * inputs \iterator + * + * @tparam ValuesInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan values + * inputs \iterator + * + * @tparam ValuesOutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan values + * outputs \iterator + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` value used in Binary scan + * functor type having member `T operator()(const T &a, const T &b)` + * + * @tparam EqualityOpT + * **[inferred]** Functor type having member + * `T operator()(const T &a, const T &b)` for binary operations that + * defines the equality of keys + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Random-access input iterator to the input sequence of key items + * + * @param[in] d_values_in + * Random-access input iterator to the input sequence of value items + * + * @param[out] d_values_out + * Random-access output iterator to the output sequence of value items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to the + * beginning of each segment in `d_values_out`) + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and + * `d_values_in`) + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. + * Default is cub::Equality(). + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScanByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + InitValueT init_value, + int num_items, + EqualityOpT equality_op = EqualityOpT(), + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int ; + + return DispatchScanByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix sum-by-key with key + * equality defined by `equality_op`. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix sum-by-key of an + * `int` device vector. 
+ * @par
+ * @code
+ * #include // or equivalently
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for
+ * // input and output
+ * int num_items; // e.g., 7
+ * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
+ * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_values_out; // e.g., [ , , , , , , ]
+ * ...
+ *
+ * // Determine temporary device storage requirements for inclusive prefix sum
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::InclusiveSumByKey(
+ * d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_values_in, d_values_out, num_items);
+ *
+ * // Allocate temporary storage for inclusive prefix sum
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run inclusive prefix sum
+ * cub::DeviceScan::InclusiveSumByKey(
+ * d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_values_in, d_values_out, num_items);
+ *
+ * // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
+ *
+ * @endcode
+ *
+ * @tparam KeysInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading scan
+ * keys inputs \iterator
+ *
+ * @tparam ValuesInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading scan
+ * values inputs \iterator
+ *
+ * @tparam ValuesOutputIteratorT
+ * **[inferred]** Random-access output iterator type for writing scan
+ * values outputs \iterator
+ *
+ * @tparam EqualityOpT
+ * **[inferred]** Functor type having member
+ * `T operator()(const T &a, const T &b)` for binary operations that
+ * defines the equality of keys
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage.
+ * When `nullptr`, the required allocation size is written to
+ * `temp_storage_bytes` and no work is done.
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Random-access input iterator to the input sequence of key items + * + * @param[in] d_values_in + * Random-access input iterator to the input sequence of value items + * + * @param[out] d_values_out + * Random-access output iterator to the output sequence of value items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and + * `d_values_in`) + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. + * Default is cub::Equality(). + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveSumByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + int num_items, + EqualityOpT equality_op = EqualityOpT(), + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int ; + + return DispatchScanByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix scan-by-key using the + * specified binary `scan_op` functor. The key equality is defined + * by `equality_op`. + * + * @par + * - Supports non-commutative scan operators. 
+ * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix min-scan-by-key + * of an `int` device vector. + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // CustomEqual functor + * struct CustomEqual + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return a == b; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_values_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * CustomEqual equality_op; + * ... 
+ *
+ * // Determine temporary device storage requirements for inclusive prefix scan
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::InclusiveScanByKey(
+ * d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
+ *
+ * // Allocate temporary storage for inclusive prefix scan
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run inclusive prefix min-scan
+ * cub::DeviceScan::InclusiveScanByKey(
+ * d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
+ *
+ * // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
+ *
+ * @endcode
+ *
+ * @tparam KeysInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading scan keys
+ * inputs \iterator
+ *
+ * @tparam ValuesInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading scan
+ * values inputs \iterator
+ *
+ * @tparam ValuesOutputIteratorT
+ * **[inferred]** Random-access output iterator type for writing scan
+ * values outputs \iterator
+ *
+ * @tparam ScanOp
+ * **[inferred]** Binary scan functor type having member
+ * `T operator()(const T &a, const T &b)`
+ *
+ * @tparam EqualityOpT
+ * **[inferred]** Functor type having member
+ * `T operator()(const T &a, const T &b)` for binary operations that
+ * defines the equality of keys
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage.
+ * When `nullptr`, the required allocation size is written to
+ * `temp_storage_bytes` and no work is done.
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Random-access input iterator to the input sequence of key items + * + * @param[in] d_values_in + * Random-access input iterator to the input sequence of value items + * + * @param[out] d_values_out + * Random-access output iterator to the output sequence of value items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and + * `d_values_in`) + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. + * Default is cub::Equality(). + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveScanByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + int num_items, + EqualityOpT equality_op = EqualityOpT(), + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + return DispatchScanByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + //@} end member group }; /** - * \example example_device_scan.cu + * @example example_device_scan.cu */ CUB_NAMESPACE_END diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index c2d04588be..2d452d3323 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -156,7 +156,7 @@ struct DeviceScanPolicy 128, 12, ///< Threads per block, items per thread OutputT, BLOCK_LOAD_DIRECT, - LOAD_LDG, + LOAD_CA, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, BLOCK_SCAN_RAKING> ScanPolicyT; @@ -170,7 +170,7 @@ struct DeviceScanPolicy 128, 12, ///< Threads per block, items per thread OutputT, BLOCK_LOAD_DIRECT, - LOAD_LDG, + LOAD_CA, ScanTransposedStore, BLOCK_SCAN_WARP_SCANS> ScanPolicyT; @@ -289,6 +289,12 @@ struct DispatchScan: typedef typename ActivePolicyT::ScanPolicyT Policy; typedef typename cub::ScanTileState ScanTileStateT; + // `LOAD_LDG` makes in-place execution UB and doesn't lead to better + // performance. 
+ static_assert( + Policy::LOAD_MODIFIER != CacheLoadModifier::LOAD_LDG, + "The memory consistency model does not apply to texture accesses"); + cudaError error = cudaSuccess; do { diff --git a/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/device/dispatch/dispatch_scan_by_key.cuh index ab71990e88..ff22a6208c 100644 --- a/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -63,10 +63,12 @@ template < typename EqualityOp, ///< Equality functor type typename ScanOpT, ///< Scan functor type typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) - typename OffsetT> ///< Signed integer type for global offsets + typename OffsetT, ///< Signed integer type for global offsets + typename KeyT = cub::detail::value_t> __launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) __global__ void DeviceScanByKeyKernel( KeysInputIteratorT d_keys_in, ///< Input keys data + KeyT *d_keys_prev_in, ///< Predecessor items for each tile ValuesInputIteratorT d_values_in, ///< Input values data ValuesOutputIteratorT d_values_out, ///< Output values data ScanByKeyTileStateT tile_state, ///< Tile status interface @@ -96,6 +98,7 @@ __global__ void DeviceScanByKeyKernel( AgentScanByKeyT( temp_storage, d_keys_in, + d_keys_prev_in, d_values_in, d_values_out, equality_op, @@ -107,6 +110,25 @@ __global__ void DeviceScanByKeyKernel( start_tile); } +template +__global__ void DeviceScanByKeyInitKernel( + ScanTileStateT tile_state, + KeysInputIteratorT d_keys_in, + cub::detail::value_t *d_keys_prev_in, + unsigned items_per_tile, + int num_tiles) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + const unsigned tid = threadIdx.x + blockDim.x * blockIdx.x; + const unsigned tile_base = tid * items_per_tile; + + if (tid > 0 && tid < num_tiles) + { + d_keys_prev_in[tid] = d_keys_in[tile_base - 1]; + } +} 
/****************************************************************************** * Policy @@ -138,7 +160,7 @@ struct DeviceScanByKeyPolicy typedef AgentScanByKeyPolicy< 128, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_LDG, + LOAD_CA, BLOCK_SCAN_WARP_SCANS, BLOCK_STORE_WARP_TRANSPOSE> ScanByKeyPolicyT; @@ -158,7 +180,7 @@ struct DeviceScanByKeyPolicy typedef AgentScanByKeyPolicy< 256, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_LDG, + LOAD_CA, BLOCK_SCAN_WARP_SCANS, BLOCK_STORE_WARP_TRANSPOSE> ScanByKeyPolicyT; @@ -283,11 +305,13 @@ struct DispatchScanByKey: int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; + size_t allocation_sizes[2]; if (CubDebug(error = ScanByKeyTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = sizeof(KeyT) * (num_tiles + 1); + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1] = {}; + void* allocations[2] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; if (d_temp_storage == NULL) { @@ -299,6 +323,8 @@ struct DispatchScanByKey: if (num_items == 0) break; + KeyT *d_keys_prev_in = reinterpret_cast(allocations[1]); + // Construct the tile status interface ScanByKeyTileStateT tile_state; if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; @@ -310,7 +336,7 @@ struct DispatchScanByKey: // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream - ).doit(init_kernel, tile_state, num_tiles); + ).doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, tile_size, num_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) break; @@ -318,7 
+344,6 @@ struct DispatchScanByKey: // Sync the stream if specified to flush runtime errors if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - // Get SM occupancy for scan_kernel int scan_sm_occupancy; if (CubDebug(error = MaxSmOccupancy( @@ -344,6 +369,7 @@ struct DispatchScanByKey: ).doit( scan_kernel, d_keys_in, + d_keys_prev_in, d_values_in, d_values_out, tile_state, @@ -375,7 +401,7 @@ struct DispatchScanByKey: typedef ReduceByKeyScanTileState ScanByKeyTileStateT; // Ensure kernels are instantiated. return Invoke( - DeviceScanInitKernel, + DeviceScanByKeyInitKernel, DeviceScanByKeyKernel< MaxPolicyT, KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, ScanByKeyTileStateT, EqualityOp, ScanOpT, InitValueT, OffsetT> diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu index 7c4efdd41a..6432d5caac 100644 --- a/test/test_device_scan.cu +++ b/test/test_device_scan.cu @@ -89,124 +89,295 @@ struct WrapperFunctor /** * Dispatch to exclusive scan entrypoint */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + IsPrimitiveT /*is_primitive*/, + int timing_timing_iterations, + size_t * /* d_temp_storage_bytes */, + cudaError_t * /* d_cdp_error */, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT /* d_out */, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - cudaError_t error = cudaSuccess; 
- for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items, stream, debug_synchronous); - } - return error; + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + d_in, + scan_op, + initial_value, + num_items, + stream, + debug_synchronous); + } + return error; } +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + IsPrimitiveT /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + stream, + debug_synchronous); + } + return error; +} /** * Dispatch to exclusive sum entrypoint */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - Sum /*scan_op*/, - InitialValueT /*initial_value*/, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + Int2Type /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * 
/*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT /* d_out */, + Sum /*scan_op*/, + InitialValueT /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); - } - return error; + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + num_items, + stream, + debug_synchronous); + } + return error; } +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + Int2Type /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + Sum /*scan_op*/, + InitialValueT /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + stream, + debug_synchronous); + } + return error; +} /** * Dispatch to inclusive scan entrypoint */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - NullType /*initial_value*/, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +CUB_RUNTIME_FUNCTION 
__forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + IsPrimitiveT /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT /* d_out */, + ScanOpT scan_op, + NullType /* initial_value */, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream, debug_synchronous); - } - return error; + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::InclusiveScan(d_temp_storage, + temp_storage_bytes, + d_in, + scan_op, + num_items, + stream, + debug_synchronous); + } + return error; } +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + IsPrimitiveT /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + NullType /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::InclusiveScan(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + num_items, + stream, + debug_synchronous); + } + return error; +} /** * Dispatch to inclusive sum entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - 
size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - Sum /*scan_op*/, - NullType /*initial_value*/, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + Int2Type /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT /* d_out */, + Sum /*scan_op*/, + NullType /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); - } - return error; + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + num_items, + stream, + debug_synchronous); + } + return error; +} + +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + Int2Type /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + Sum /*scan_op*/, + NullType /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + stream, + debug_synchronous); + } + return error; } //--------------------------------------------------------------------- @@ -216,104 +387,116 @@ cudaError_t Dispatch( /** * Simple 
wrapper kernel to invoke DeviceScan */ -template -__global__ void CnpDispatchKernel( - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - bool debug_synchronous) +template +__global__ void CnpDispatchKernel(Int2Type /*in_place*/, + IsPrimitiveT is_primitive, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + void *d_temp_storage, + size_t temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + bool debug_synchronous) { #ifndef CUB_CDP - (void)is_primitive; - (void)timing_timing_iterations; - (void)d_temp_storage_bytes; - (void)d_cdp_error; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_out; - (void)scan_op; - (void)initial_value; - (void)num_items; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; + (void)is_primitive; + (void)timing_timing_iterations; + (void)d_temp_storage_bytes; + (void)d_cdp_error; + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_out; + (void)scan_op; + (void)initial_value; + (void)num_items; + (void)debug_synchronous; + *d_cdp_error = cudaErrorNotSupported; #else - *d_cdp_error = Dispatch( - Int2Type(), - is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - initial_value, - num_items, - 0, - debug_synchronous); - - *d_temp_storage_bytes = temp_storage_bytes; + *d_cdp_error = Dispatch(Int2Type(), + is_primitive, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + 0, + debug_synchronous); + + 
*d_temp_storage_bytes = temp_storage_bytes; #endif } - /** * Dispatch to CDP kernel */ -template -cudaError_t Dispatch( - Int2Type dispatch_to, - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +cudaError_t Dispatch(Int2Type /*in_place*/, + Int2Type dispatch_to, + IsPrimitiveT is_primitive, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>( - is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - initial_value, - num_items, - debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + // Invoke kernel to invoke device-side dispatch + CnpDispatchKernel<<<1, 1>>>(is_primitive, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + debug_synchronous); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + cudaError_t retval; + 
CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + return retval; } - //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- @@ -416,7 +599,7 @@ struct AllocateOutput { template struct AllocateOutput { - static void run(OutputT *&d_out, OutputT *d_in, int num_items) { + static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */) { d_out = d_in; } }; @@ -454,6 +637,7 @@ void Test( void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch( + Int2Type(), Int2Type(), Int2Type::PRIMITIVE>(), 1, @@ -470,11 +654,15 @@ void Test( true)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); - // Clear device output array - CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items)); + if (!InPlace) + { + // Clear device output array + CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items)); + } // Run warmup/correctness iteration CubDebugExit(Dispatch( + Int2Type(), Int2Type(), Int2Type::PRIMITIVE>(), 1, @@ -499,39 +687,51 @@ void Test( fflush(stderr); // Performance - GpuTimer gpu_timer; - gpu_timer.Start(); - CubDebugExit(Dispatch(Int2Type(), - Int2Type::PRIMITIVE>(), - g_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - initial_value, - num_items, - 0, - false)); - gpu_timer.Stop(); - float elapsed_millis = gpu_timer.ElapsedMillis(); - - // Display performance if (g_timing_iterations > 0) { - float avg_millis = elapsed_millis / g_timing_iterations; - float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; - float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); - printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", - avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); + 
GpuTimer gpu_timer; + gpu_timer.Start(); + CubDebugExit(Dispatch(Int2Type(), + Int2Type(), + Int2Type::PRIMITIVE>(), + g_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + 0, + false)); + gpu_timer.Stop(); + float elapsed_millis = gpu_timer.ElapsedMillis(); + + // Display performance + float avg_millis = elapsed_millis / g_timing_iterations; + float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; + float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); + printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% " + "peak", + avg_millis, + giga_rate, + giga_bandwidth, + giga_bandwidth / g_device_giga_bandwidth * 100.0); } printf("\n\n"); // Cleanup - if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); + if (!InPlace) + { + if (d_out) + { + CubDebugExit(g_allocator.DeviceFree(d_out)); + } + } + if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); @@ -626,37 +826,30 @@ TestFutureInitValueIter( // cub::NullType does not have device pointer, so nothing to do here } -template < - Backend BACKEND, - typename DeviceInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT> -auto TestInplace( - DeviceInputIteratorT d_in, - OutputT *h_reference, - int num_items, - ScanOpT scan_op, - InitialValueT initial_value) -> typename std::enable_if::value>::type +template +auto TestInplace(OutputT *d_in, + OutputT *h_reference, + int num_items, + ScanOpT scan_op, + InitialValueT initial_value) { - Test(d_in, h_reference, num_items, scan_op, initial_value); + Test(d_in, + h_reference, + num_items, + scan_op, + initial_value); } -template < - Backend BACKEND, - typename DeviceInputIteratorT, - typename OutputT, - typename ScanOpT, - typename 
InitialValueT> -auto TestInplace( - DeviceInputIteratorT d_in, - OutputT *, - int, - ScanOpT, - InitialValueT) -> typename std::enable_if::value>::type -{ - (void)d_in; -} +template +auto TestInplace(DeviceInputIteratorT, OutputT *, int, ScanOpT, InitialValueT) +{} /** * Test DeviceScan on pointer type @@ -946,5 +1139,3 @@ int main(int argc, char** argv) return 0; } - - diff --git a/test/test_device_scan_by_key.cu b/test/test_device_scan_by_key.cu index c5273befda..ba1d548675 100644 --- a/test/test_device_scan_by_key.cu +++ b/test/test_device_scan_by_key.cu @@ -64,6 +64,14 @@ enum Backend }; +enum AliasMode +{ + AliasNone, // output is allocated + AliasKeys, // output is an alias of input keys + AliasValues // output is an alias of input values +}; + + /** * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) */ @@ -473,7 +481,7 @@ struct AllocateOutput { template struct AllocateOutput { - static void run(OutputT *&d_out, OutputT *d_in, int num_items) { + static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */) { d_out = d_in; } }; @@ -489,7 +497,7 @@ template < typename ScanOpT, typename InitialValueT, typename EqualityOpT, - bool InPlace=false> + AliasMode Mode=AliasNone> void Test( KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, @@ -504,7 +512,21 @@ void Test( // Allocate device output array OutputT *d_values_out = NULL; - AllocateOutput::run(d_values_out, d_values_in, num_items); + + if (Mode == AliasKeys) + { + AllocateOutput::run( + d_values_out, + d_keys_in, + num_items); + } + else + { + AllocateOutput::run( + d_values_out, + d_values_in, + num_items); + } // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; @@ -535,7 +557,10 @@ void Test( CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Clear device output array - CubDebugExit(cudaMemset(d_values_out, 0, sizeof(OutputT) * num_items)); + if (Mode == AliasNone) + { + CubDebugExit(cudaMemset(d_values_out, 
0, sizeof(OutputT) * num_items)); + } // Run warmup/correctness iteration CubDebugExit(Dispatch( @@ -557,38 +582,43 @@ void Test( true)); // Check for correctness (and display results, if specified) - int compare = CompareDeviceResults(h_reference, d_values_out, num_items, true, g_verbose); + const int compare = CompareDeviceResults(h_reference, + d_values_out, + num_items, + true, + g_verbose); + printf("\t%s", compare ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); - // Performance - GpuTimer gpu_timer; - gpu_timer.Start(); - CubDebugExit(Dispatch(Int2Type(), - Int2Type::PRIMITIVE>(), - g_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - initial_value, - num_items, - equality_op, - 0, - false)); - gpu_timer.Stop(); - float elapsed_millis = gpu_timer.ElapsedMillis(); - // Display performance if (g_timing_iterations > 0) { + // Performance + GpuTimer gpu_timer; + gpu_timer.Start(); + CubDebugExit(Dispatch(Int2Type(), + Int2Type::PRIMITIVE>(), + g_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + scan_op, + initial_value, + num_items, + equality_op, + 0, + false)); + + gpu_timer.Stop(); + float elapsed_millis = gpu_timer.ElapsedMillis(); float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); @@ -599,7 +629,14 @@ void Test( printf("\n\n"); // Cleanup - if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out)); + if (Mode == AliasNone) + { + if (d_values_out) + { + CubDebugExit(g_allocator.DeviceFree(d_values_out)); + } + } + if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if 
(d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); @@ -608,46 +645,98 @@ void Test( AssertEquals(0, compare); } -template < - Backend BACKEND, - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -auto TestInplace( - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - OutputT *h_reference, - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -> typename std::enable_if::value>::type +template +auto TestInplaceValues(KeysInputIteratorT d_keys_in, + OutputT *d_values_in, + OutputT *h_reference, + int num_items, + ScanOpT scan_op, + InitialValueT initial_value, + EqualityOpT equality_op) { - Test(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); + Test(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); } -template < - Backend BACKEND, - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -auto TestInplace( - KeysInputIteratorT, - ValuesInputIteratorT d_values_in, - OutputT *, - int, - ScanOpT, - InitialValueT, - EqualityOpT) -> typename std::enable_if::value>::type +template +auto TestInplaceValues(KeysInputIteratorT, + ValuesInputIteratorT, + OutputT *, + int, + ScanOpT, + InitialValueT, + EqualityOpT) +{} + +template +auto TestInplaceKeys(T *d_keys_in, + ValuesInputIteratorT d_values_in, + T *h_reference, + int num_items, + ScanOpT scan_op, + InitialValueT initial_value, + EqualityOpT equality_op) { - (void)d_values_in; + Test(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); } +template +auto TestInplaceKeys(KeysInputIteratorT, + ValuesInputIteratorT, + OutputT *, + int, + ScanOpT, + InitialValueT, + EqualityOpT) +{} + /** * Test DeviceScan on pointer type */ @@ 
-676,7 +765,7 @@ void TestPointer( fflush(stdout); // Allocate host arrays - KeyT* h_keys_in = new KeyT[num_items]; + KeyT* h_keys_in = new KeyT[num_items]; InputT* h_values_in = new InputT[num_items]; OutputT* h_reference = new OutputT[num_items]; @@ -711,8 +800,33 @@ void TestPointer( CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); // Run Test - Test(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); - TestInplace(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); + Test(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); + + // Test in/out values aliasing + TestInplaceValues(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); + + CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); + + // Test keys/values aliasing (should go last, changes keys) + TestInplaceKeys(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); // Cleanup if (h_keys_in) delete[] h_keys_in; @@ -850,7 +964,7 @@ void TestKeyTAndEqualityOp( OutputT initial_value) { TestOp(num_items, identity, initial_value, Equality()); - TestOp( num_items, identity, initial_value, Mod2Equality()); + TestOp( num_items, identity, initial_value, Mod2Equality()); } /** @@ -933,7 +1047,6 @@ int main(int argc, char** argv) TestSize(num_items, (unsigned long long)0, (unsigned long long)99); - #elif TEST_VALUE_TYPES == 2 TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21));