From b00f714a11df33b5f24bc2ad14aaebda71446413 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 17 May 2022 17:47:08 +0400 Subject: [PATCH] In-place guarantees for scan algorithms --- cub/agent/agent_scan_by_key.cuh | 37 +- cub/device/device_scan.cuh | 2765 ++++++++++++------ cub/device/dispatch/dispatch_scan.cuh | 10 +- cub/device/dispatch/dispatch_scan_by_key.cuh | 42 +- test/test_device_scan.cu | 671 +++-- test/test_device_scan_by_key.cu | 247 +- 6 files changed, 2624 insertions(+), 1148 deletions(-) diff --git a/cub/agent/agent_scan_by_key.cuh b/cub/agent/agent_scan_by_key.cuh index 68575a6370..8e25c48af4 100644 --- a/cub/agent/agent_scan_by_key.cuh +++ b/cub/agent/agent_scan_by_key.cuh @@ -158,6 +158,7 @@ struct AgentScanByKey TempStorage & storage; WrappedKeysInputIteratorT d_keys_in; + KeyT* d_keys_prev_in; WrappedValuesInputIteratorT d_values_in; ValuesOutputIteratorT d_values_out; InequalityWrapper inequality_op; @@ -364,19 +365,27 @@ struct AgentScanByKey } else { - KeyT tile_pred_key = (threadIdx.x == 0) ? d_keys_in[tile_base - 1] : KeyT(); - BlockDiscontinuityKeysT(storage.scan_storage.discontinuity) - .FlagHeads(segment_flags, keys, inequality_op, tile_pred_key); - - // Zip values and segment_flags - ZipValuesAndFlags(num_remaining, - values, - segment_flags, - scan_items); - - SizeValuePairT tile_aggregate; - TilePrefixCallbackT prefix_op(tile_state, storage.scan_storage.prefix, pair_scan_op, tile_idx); - ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type()); + KeyT tile_pred_key = (threadIdx.x == 0) ? 
d_keys_prev_in[tile_idx] + : KeyT(); + + BlockDiscontinuityKeysT(storage.scan_storage.discontinuity) + .FlagHeads(segment_flags, keys, inequality_op, tile_pred_key); + + // Zip values and segment_flags + ZipValuesAndFlags(num_remaining, + values, + segment_flags, + scan_items); + + SizeValuePairT tile_aggregate; + TilePrefixCallbackT prefix_op(tile_state, + storage.scan_storage.prefix, + pair_scan_op, + tile_idx); + ScanTile(scan_items, + tile_aggregate, + prefix_op, + Int2Type()); } CTA_SYNC(); @@ -408,6 +417,7 @@ struct AgentScanByKey AgentScanByKey( TempStorage & storage, KeysInputIteratorT d_keys_in, + KeyT * d_keys_prev_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, @@ -416,6 +426,7 @@ struct AgentScanByKey : storage(storage), d_keys_in(d_keys_in), + d_keys_prev_in(d_keys_prev_in), d_values_in(d_values_in), d_values_out(d_values_out), inequality_op(equality_op), diff --git a/cub/device/device_scan.cuh b/cub/device/device_scan.cuh index 5d63d96244..c688515ff7 100644 --- a/cub/device/device_scan.cuh +++ b/cub/device/device_scan.cuh @@ -1,7 +1,6 @@ - /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -14,10 +13,10 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -28,844 +27,1974 @@ ******************************************************************************/ /** - * \file - * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + * @file cub::DeviceScan provides device-wide, parallel operations for + * computing a prefix scan across a sequence of data items residing + * within device-accessible memory. */ #pragma once -#include -#include +#include +#include +#include +#include -#include "../config.cuh" -#include "../thread/thread_operators.cuh" -#include "dispatch/dispatch_scan.cuh" -#include "dispatch/dispatch_scan_by_key.cuh" CUB_NAMESPACE_BEGIN /** - * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) - * \ingroup SingleModule + * @brief DeviceScan provides device-wide, parallel operations for computing a + * prefix scan across a sequence of data items residing within + * device-accessible memory. 
![](device_scan.png) + * + * @ingroup SingleModule * - * \par Overview - * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output sequence where each element is computed to be the reduction - * of the elements occurring earlier in the input sequence. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. + * @par Overview + * Given a sequence of input elements and a binary reduction operator, a + * [*prefix scan*](http://en.wikipedia.org/wiki/Prefix_sum) produces an output + * sequence where each element is computed to be the reduction of the elements + * occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan + * with the addition operator. The term *inclusive* indicates that the + * *i*th output reduction incorporates the *i*th input. + * The term *exclusive* indicates the *i*th input is not + * incorporated into the *i*th output reduction. When the input and + * output sequences are the same, the scan is performed in-place. * - * \par - * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm - * for performing global prefix scan with only a single pass through the - * input data, as described in our 2016 technical report [1]. The central - * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies - * of global prefix propagation with local computation. As such, our algorithm requires only - * ~2n data movement (n inputs are read, n outputs are written), and typically - * proceeds at "memcpy" speeds. Our algorithm supports inplace operations. 
+ * @par + * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our + * *"decoupled look-back"* algorithm for performing global prefix scan with + * only a single pass through the input data, as described in our 2016 technical + * report [1]. The central idea is to leverage a small, constant factor of + * redundant work in order to overlap the latencies of global prefix + * propagation with local computation. As such, our algorithm requires only + * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and + * typically proceeds at "memcpy" speeds. Our algorithm supports inplace + * operations. * - * \par + * @par * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) * - * \par Usage Considerations - * \cdp_class{DeviceScan} + * @par Usage Considerations + * @cdp_class{DeviceScan} * - * \par Performance - * \linear_performance{prefix scan} + * @par Performance + * @linear_performance{prefix scan} * - * \par - * The following chart illustrates DeviceScan::ExclusiveSum - * performance across different CUDA architectures for \p int32 keys. - * \plots_below + * @par + * The following chart illustrates DeviceScan::ExclusiveSum performance across + * different CUDA architectures for `int32` keys. + * @plots_below * - * \image html scan_int32.png + * @image html scan_int32.png * */ struct DeviceScan { - /******************************************************************//** - * \name Exclusive scans - *********************************************************************/ - //@{ - - /** - * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. - * - * \par - * - Supports non-commutative sum operators. 
- * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated exclusive sum performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. - * - * \image html scan_int32.png - * \image html scan_int64.png - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out s<-- [0, 8, 14, 21, 26, 29, 29] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveSum( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Random-access iterator to the input sequence of data items - OutputIteratorT d_out, ///< [out] Random-access iterator to the output sequence of data items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The output value type -- used as the intermediate accumulator - // Use the input value type per https://wg21.link/P0571 - using OutputT = cub::detail::value_t; - - // Initial value - OutputT init_value = 0; - - return DispatchScan, OffsetT>::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - Sum(), - detail::InputValue(init_value), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out. - * - * \par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. 
- * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector - * \par - * \code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... - * - * // Determine temporary device storage requirements for exclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) INT_MAX, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) INT_MAX, num_items); - * - * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam InitValueT [inferred] Type of the \p init_value used Binary scan functor type having member T operator()(const T &a, const T &b) - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT, - typename InitValueT> - 
CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveScan( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Random-access iterator to the input sequence of data items - OutputIteratorT d_out, ///< [out] Random-access iterator to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan, OffsetT>::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream, - debug_synchronous); - } - - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT, - typename InitValueT, - typename InitValueIterT=InitValueT*> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveScan( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - FutureValue init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan, OffsetT>::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - detail::InputValue(init_value), - num_items, - stream, - debug_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive scans - *********************************************************************/ - //@{ - - - /** - * \brief Computes a device-wide inclusive prefix sum. - * - * \par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [8, 14, 21, 26, 29, 29, 38] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveSum( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Random-access iterator to the input sequence of data items - OutputIteratorT d_out, ///< [out] Random-access iterator to the output sequence of data items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - Sum(), - NullType(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. - * - * \par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. - * \par - * \code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); - * - * // d_out <-- [8, 6, 6, 5, 3, 0, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveScan( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Random-access iterator to the input sequence of data items - OutputIteratorT d_out, ///< [out] Random-access iterator to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - NullType(), - num_items, - stream, - debug_synchronous); - } - - /** - * \brief Computes a device-wide exclusive prefix sum-by-key with key equality - * defined by \p equality_op . The value of 0 is applied as the initial value, - * and is assigned to the beginning of each segment in \p d_values_out . - * - * \par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix sum-by-key of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items); - * - * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator - * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename ValuesOutputIteratorT, - typename EqualityOpT = Equality> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveSumByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items - ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items - ValuesOutputIteratorT d_values_out, ///< [out] Random-access output iterator to the output sequence of value items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_keys_in and \p d_values_in) - EqualityOpT equality_op = EqualityOpT(), ///< [in] Binary functor that defines the equality of keys. Default is cub::Equality(). - cudaStream_t stream=0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous=false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The output value type -- used as the intermediate accumulator - // Use the input value type per https://wg21.link/P0571 - using OutputT = cub::detail::value_t; - - // Initial value - OutputT init_value = 0; - - return DispatchScanByKey< - KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, Sum, OutputT, OffsetT> - ::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - Sum(), - init_value, - num_items, - stream, - debug_synchronous); - } - - /** - * \brief Computes a device-wide exclusive prefix scan-by-key using the specified binary \p scan_op functor. - * The key equality is defined by \p equality_op . The \p init_value value is applied as the initial value, - * and is assigned to the beginning of each segment in \p d_values_out . - * - * \par - * - Supports non-commutative scan operators. 
- * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix min-scan-by-key of an \p int device vector - * \par - * \code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // CustomEqual functor - * struct CustomEqual - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return a == b; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * CustomEqual equality_op; - * ... 
- * - * // Determine temporary device storage requirements for exclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, min_op, (int) INT_MAX, num_items, equality_op); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, min_op, (int) INT_MAX, num_items, equality_op); - * - * // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator - * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam InitValueT [inferred] Type of the \p init_value value used in Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename ValuesOutputIteratorT, - typename ScanOpT, - typename InitValueT, - typename EqualityOpT = Equality> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveScanByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items - ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items - ValuesOutputIteratorT d_values_out, ///< [out] Random-access output iterator to the output sequence of value items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to the beginning of each segment in \p d_values_out) - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_keys_in and \p d_values_in) - EqualityOpT equality_op = EqualityOpT(), ///< [in] Binary functor that defines the equality of keys. Default is cub::Equality(). - cudaStream_t stream=0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous=false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScanByKey< - KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, ScanOpT, InitValueT, OffsetT> - ::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - scan_op, - init_value, - num_items, - stream, - debug_synchronous); - } - - /** - * \brief Computes a device-wide inclusive prefix sum-by-key with key equality defined by \p equality_op . - * - * \par - * - Supports non-commutative sum operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). 
Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix sum-by-key of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSumByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items); - * - * // d_out <-- [8, 14, 7, 12, 15, 0, 9] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator - * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename ValuesOutputIteratorT, - 
typename EqualityOpT = Equality> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveSumByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items - ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items - ValuesOutputIteratorT d_values_out, ///< [out] Random-access output iterator to the output sequence of value items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_keys_in and \p d_values_in) - EqualityOpT equality_op = EqualityOpT(), ///< [in] Binary functor that defines the equality of keys. Default is cub::Equality(). - cudaStream_t stream=0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous=false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScanByKey< - KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, Sum, NullType, OffsetT> - ::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - Sum(), - NullType(), - num_items, - stream, - debug_synchronous); - } - - /** - * \brief Computes a device-wide inclusive prefix scan-by-key using the specified binary \p scan_op functor. - * The key equality is defined by \p equality_op . - * - * \par - * - Supports non-commutative scan operators. - * - Results are not deterministic for pseudo-associative operators (e.g., - * addition of floating-point types). 
Results for pseudo-associative - * operators may vary from run to run. Additional details can be found in - * the [decoupled look-back] description. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix min-scan-by-key of an \p int device vector. - * \par - * \code - * #include // or equivalently - * #include // for INT_MAX - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // CustomEqual functor - * struct CustomEqual - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return a == b; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] - * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_values_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * CustomEqual equality_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScanByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); - * - * // d_out <-- [8, 6, 7, 5, 3, 0, 0] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator - * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys - * - * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back - */ - template < - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename ValuesOutputIteratorT, - typename ScanOpT, - typename EqualityOpT = Equality> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveScanByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Random-access input iterator to the input sequence of key items - ValuesInputIteratorT d_values_in, ///< [in] Random-access input iterator to the input sequence of value items - ValuesOutputIteratorT d_values_out, ///< [out] Random-access output iterator to the output sequence of value items - ScanOpT scan_op, ///< [in] Binary scan functor - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_keys_in and \p d_values_in) - EqualityOpT equality_op = EqualityOpT(), ///< [in] Binary functor that defines the equality of keys. Default is cub::Equality(). - cudaStream_t stream=0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous=false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScanByKey< - KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, ScanOpT, NullType, OffsetT> - ::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - scan_op, - NullType(), - num_items, - stream, - debug_synchronous); - } - - //@} end member group + /******************************************************************//** + * \name Exclusive scans + *********************************************************************/ + //@{ + + /** + * @brief Computes a device-wide exclusive prefix sum. The value of `0` is + * applied as the initial value, and is assigned to `*d_out`. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). 
Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When the input and output sequences are the same, the scan is + * performed in-place. + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. + * - @devicestorage + * + * @par Performance + * The following charts illustrate saturated exclusive sum performance across + * different CUDA architectures for `int32` and `int64` items, respectively. + * + * @image html scan_int32.png + * @image html scan_int64.png + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix sum of an `int` + * device vector. + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, num_items); + * + * // d_out <-- [0, 8, 14, 21, 26, 29, 29] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. 
When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Random-access iterator to the input sequence of data items + * + * @param[out] d_out + * Random-access iterator to the output sequence of data items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveSum(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + // The output value type -- used as the intermediate accumulator + // Use the input value type per https://wg21.link/P0571 + using OutputT = cub::detail::value_t; + + // Initial value + OutputT init_value = 0; + + return DispatchScan< + InputIteratorT, OutputIteratorT, Sum, detail::InputValue, + OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, + Sum(), detail::InputValue(init_value), + num_items, stream, debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix sum in-place. The value of + * `0` is applied as the initial value, and is assigned to `*d_data`. + * + * @par + * - Supports non-commutative sum operators. 
+ * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When the input and output sequences are the same, the scan is + * performed in-place. + * - @devicestorage + * + * @par Performance + * The following charts illustrate saturated exclusive sum performance across + * different CUDA architectures for `int32` and `int64` items, respectively. + * + * @image html scan_int32.png + * @image html scan_int64.png + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix sum of an `int` + * device vector. + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_data, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_data, num_items); + * + * // d_data <-- [0, 8, 14, 21, 26, 29, 29] + * + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access iterator type for reading scan + * inputs and writing scan outputs + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in,out] d_data + * Random-access iterator to the sequence of data items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveSum(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is applied as + * the initial value, and is assigned to `*d_out`. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. 
+ * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an + * `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, (int) INT_MAX, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, (int) INT_MAX, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type + * having member `T operator()(const T &a, const T &b)` + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. 
When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Random-access iterator to the input sequence of data items + * + * @param[out] d_out + * Random-access iterator to the output sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to *d_out) + * + * @param[in] num_items + * Total number of input items (i.e., the length of \p d_in) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. Default is + * stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitValueT init_value, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int ; + + return DispatchScan, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue( + init_value), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is applied as + * the initial value, and is assigned to `*d_data`. + * + * @par + * - Supports non-commutative scan operators. 
+ * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an + * `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_data, min_op, (int) INT_MAX, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_data, min_op, (int) INT_MAX, num_items); + * + * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs and writing scan outputs + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type + * having member `T operator()(const T &a, const T &b)` + * + * @param[in] d_temp_storage + * Device-accessible 
allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in,out] d_data + * Random-access iterator to the sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to *d_data) + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_data`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. Default is + * stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + ScanOpT scan_op, + InitValueT init_value, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return ExclusiveScan(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is provided as + * a future value. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. 
+ * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an + * `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * int *d_init_iter; // e.g., INT_MAX + * CustomMin min_op; + * + * auto future_init_value = + * cub::FutureValue(d_init_iter); + * + * ... + * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, future_init_value, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, future_init_value, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * 
@tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type + * having member `T operator()(const T &a, const T &b)` + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of \p d_temp_storage allocation + * + * @param[in] d_in + * Pointer to the input sequence of data items + * + * @param[out] d_out + * Pointer to the output sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to `*d_out`) + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + FutureValue init_value, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + return DispatchScan, + OffsetT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + detail::InputValue( + init_value), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. 
The `init_value` value is provided as + * a future value. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an + * `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_init_iter; // e.g., INT_MAX + * CustomMin min_op; + * + * auto future_init_value = + * cub::FutureValue(d_init_iter); + * + * ... 
+ * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_data, min_op, future_init_value, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_data, min_op, future_init_value, num_items); + * + * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs and writing scan outputs + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type + * having member `T operator()(const T &a, const T &b)` + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of \p d_temp_storage allocation + * + * @param[in,out] d_data + * Pointer to the sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to `*d_out`) + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + ScanOpT scan_op, + FutureValue init_value, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return ExclusiveScan(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + //@} end member group + /******************************************************************//** + * @name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * @brief Computes a device-wide inclusive prefix sum. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix sum of an `int` + * device vector. + * + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... 
+ * + * // Determine temporary device storage requirements for inclusive + * // prefix sum + * void *d_temp_storage = nullptr; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, num_items); + * + * // d_out <-- [8, 14, 21, 26, 29, 29, 38] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Random-access iterator to the input sequence of data items + * + * @param[out] d_out + * Random-access iterator to the output sequence of data items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveSum(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + return DispatchScan::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix sum in-place. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix sum of an `int` + * device vector. + * + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * ... 
+ * + * // Determine temporary device storage requirements for inclusive + * // prefix sum + * void *d_temp_storage = nullptr; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_data, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum( + * d_temp_storage, temp_storage_bytes, + * d_data, num_items); + * + * // d_data <-- [8, 14, 21, 26, 29, 29, 38] + * + * @endcode + * + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs and writing scan outputs + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in,out] d_data + * Random-access iterator to the sequence of data items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveSum(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return InclusiveSum(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix scan using the specified + * binary `scan_op` functor. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * shall not overlap in any other way. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an + * `int` device vector. + * + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... 
+ * + * // Determine temporary device storage requirements for inclusive + * // prefix scan + * void *d_temp_storage = nullptr; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix min-scan + * cub::DeviceScan::InclusiveScan( + * d_temp_storage, temp_storage_bytes, + * d_in, d_out, min_op, num_items); + * + * // d_out <-- [8, 6, 6, 5, 3, 0, 0] + * + * @endcode + * + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * inputs \iterator + * + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * outputs \iterator + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @param[in] + * d_temp_storage Device-accessible allocation of temporary storage. + * When `nullptr`, the required allocation size is written to + * `temp_storage_bytes` and no work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_in + * Random-access iterator to the input sequence of data items + * + * @param[out] d_out + * Random-access iterator to the output sequence of data items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_in`) + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + return DispatchScan::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix scan using the specified + * binary `scan_op` functor. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an + * `int` device vector. + * + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] + * CustomMin min_op; + * ... 
+ *
+ * // Determine temporary device storage requirements for inclusive
+ * // prefix scan
+ * void *d_temp_storage = nullptr;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::InclusiveScan(
+ * d_temp_storage, temp_storage_bytes,
+ * d_data, min_op, num_items);
+ *
+ * // Allocate temporary storage for inclusive prefix scan
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run inclusive prefix min-scan
+ * cub::DeviceScan::InclusiveScan(
+ * d_temp_storage, temp_storage_bytes,
+ * d_data, min_op, num_items);
+ *
+ * // d_data <-- [8, 6, 6, 5, 3, 0, 0]
+ *
+ * @endcode
+ *
+ * @tparam IteratorT
+ * **[inferred]** Random-access input iterator type for reading scan
+ * inputs and writing scan outputs
+ *
+ * @tparam ScanOp
+ * **[inferred]** Binary scan functor type having member
+ * `T operator()(const T &a, const T &b)`
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage.
+ * When `nullptr`, the required allocation size is written to
+ * `temp_storage_bytes` and no work is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_data
+ * Random-access iterator to the sequence of data items
+ *
+ * @param[in] scan_op
+ * Binary scan functor
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., the length of `d_data`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. May cause significant slowdown.
+ * Default is `false`.
+ * + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveScan(void *d_temp_storage, + size_t &temp_storage_bytes, + IteratorT d_data, + ScanOpT scan_op, + int num_items, + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + return InclusiveScan(d_temp_storage, + temp_storage_bytes, + d_data, + d_data, + scan_op, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix sum-by-key with key equality + * defined by `equality_op`. The value of `0` is applied as the initial + * value, and is assigned to the beginning of each segment in + * `d_values_out`. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix sum-by-key of an + * `int` device vector. + * @par + * @code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_values_out; // e.g., [ , , , , , , ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = nullptr; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSumByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSumByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, d_values_out, num_items); + * + * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] + * + * @endcode + * + * @tparam KeysInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan keys + * inputs \iterator + * + * @tparam ValuesInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan + * values inputs \iterator + * + * @tparam ValuesOutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan + * values outputs \iterator + * + * @tparam EqualityOpT + * **[inferred]** Functor type having member + * `T operator()(const T &a, const T &b)` for binary operations that + * defines the equality of keys + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no + * work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Random-access input iterator to the input sequence of key items + * + * @param[in] d_values_in + * Random-access input iterator to the input sequence of value items + * + * @param[out] d_values_out + * Random-access output iterator to the output sequence of value items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and + * `d_values_in`) + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. 
+ * Default is cub::Equality(). + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveSumByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + int num_items, + EqualityOpT equality_op = EqualityOpT(), + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + // The output value type -- used as the intermediate accumulator + // Use the input value type per https://wg21.link/P0571 + using OutputT = cub::detail::value_t; + + // Initial value + OutputT init_value = 0; + + return DispatchScanByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + Sum(), + init_value, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide exclusive prefix scan-by-key using the + * specified binary `scan_op` functor. The key equality is defined by + * `equality_op`. The `init_value` value is applied as the initial + * value, and is assigned to the beginning of each segment in + * `d_values_out`. + * + * @par + * - Supports non-commutative scan operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. 
+ * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the exclusive prefix min-scan-by-key of + * an `int` device vector + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // CustomEqual functor + * struct CustomEqual + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return a == b; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_values_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * CustomEqual equality_op; + * ... 
+ * + * // Determine temporary device storage requirements for exclusive + * // prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScanByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, d_values_out, min_op, + * (int) INT_MAX, num_items, equality_op); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScanByKey( + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, d_values_out, min_op, + * (int) INT_MAX, num_items, equality_op); + * + * // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] + * + * @endcode + * + * @tparam KeysInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan keys + * inputs \iterator + * + * @tparam ValuesInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan values + * inputs \iterator + * + * @tparam ValuesOutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan values + * outputs \iterator + * + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member + * `T operator()(const T &a, const T &b)` + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` value used in Binary scan + * functor type having member `T operator()(const T &a, const T &b)` + * + * @tparam EqualityOpT + * **[inferred]** Functor type having member + * `T operator()(const T &a, const T &b)` for binary operations that + * defines the equality of keys + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work + * is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Random-access input iterator to the input sequence of key items + * + * @param[in] d_values_in + * Random-access input iterator to the input sequence of value items + * + * @param[out] d_values_out + * Random-access output iterator to the output sequence of value items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] init_value + * Initial value to seed the exclusive scan (and is assigned to the + * beginning of each segment in `d_values_out`) + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and + * `d_values_in`) + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. + * Default is cub::Equality(). + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + ExclusiveScanByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + InitValueT init_value, + int num_items, + EqualityOpT equality_op = EqualityOpT(), + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int ; + + return DispatchScanByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix sum-by-key with key + * equality defined by `equality_op`. + * + * @par + * - Supports non-commutative sum operators. + * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix sum-by-key of an + * `int` device vector. 
+ * @par
+ * @code
+ * #include // or equivalently
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for
+ * // input and output
+ * int num_items; // e.g., 7
+ * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
+ * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_values_out; // e.g., [ , , , , , , ]
+ * ...
+ *
+ * // Determine temporary device storage requirements for inclusive prefix sum
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::InclusiveSumByKey(
+ * d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_values_in, d_values_out, num_items);
+ *
+ * // Allocate temporary storage for inclusive prefix sum
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run inclusive prefix sum
+ * cub::DeviceScan::InclusiveSumByKey(
+ * d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_values_in, d_values_out, num_items);
+ *
+ * // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
+ *
+ * @endcode
+ *
+ * @tparam KeysInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading scan
+ * keys inputs \iterator
+ *
+ * @tparam ValuesInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading scan
+ * values inputs \iterator
+ *
+ * @tparam ValuesOutputIteratorT
+ * **[inferred]** Random-access output iterator type for writing scan
+ * values outputs \iterator
+ *
+ * @tparam EqualityOpT
+ * **[inferred]** Functor type having member
+ * `T operator()(const T &a, const T &b)` for binary operations that
+ * defines the equality of keys
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage.
+ * When `nullptr`, the required allocation size is written to
+ * `temp_storage_bytes` and no work is done.
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Random-access input iterator to the input sequence of key items + * + * @param[in] d_values_in + * Random-access input iterator to the input sequence of value items + * + * @param[out] d_values_out + * Random-access output iterator to the output sequence of value items + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and + * `d_values_in`) + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. + * Default is cub::Equality(). + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. + * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveSumByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + int num_items, + EqualityOpT equality_op = EqualityOpT(), + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int ; + + return DispatchScanByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + /** + * @brief Computes a device-wide inclusive prefix scan-by-key using the + * specified binary `scan_op` functor. The key equality is defined + * by `equality_op`. + * + * @par + * - Supports non-commutative scan operators. 
+ * - Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. Additional details can be found in + * the [decoupled look-back] description. + * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range + * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. + * - @devicestorage + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix min-scan-by-key + * of an `int` device vector. + * @par + * @code + * #include // or equivalently + * #include // for INT_MAX + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // CustomEqual functor + * struct CustomEqual + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return a == b; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for + * // input and output + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] + * int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_values_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * CustomEqual equality_op; + * ... 
+ *
+ * // Determine temporary device storage requirements for inclusive prefix scan
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::InclusiveScanByKey(
+ * d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
+ *
+ * // Allocate temporary storage for inclusive prefix scan
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run inclusive prefix min-scan
+ * cub::DeviceScan::InclusiveScanByKey(
+ * d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
+ *
+ * // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
+ *
+ * @endcode
+ *
+ * @tparam KeysInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading scan keys
+ * inputs \iterator
+ *
+ * @tparam ValuesInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading scan
+ * values inputs \iterator
+ *
+ * @tparam ValuesOutputIteratorT
+ * **[inferred]** Random-access output iterator type for writing scan
+ * values outputs \iterator
+ *
+ * @tparam ScanOp
+ * **[inferred]** Binary scan functor type having member
+ * `T operator()(const T &a, const T &b)`
+ *
+ * @tparam EqualityOpT
+ * **[inferred]** Functor type having member
+ * `T operator()(const T &a, const T &b)` for binary operations that
+ * defines the equality of keys
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage.
+ * When `nullptr`, the required allocation size is written to
+ * `temp_storage_bytes` and no work is done.
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of `d_temp_storage` allocation + * + * @param[in] d_keys_in + * Random-access input iterator to the input sequence of key items + * + * @param[in] d_values_in + * Random-access input iterator to the input sequence of value items + * + * @param[out] d_values_out + * Random-access output iterator to the output sequence of value items + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and + * `d_values_in`) + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. + * Default is cub::Equality(). + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is `false`. 
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template + CUB_RUNTIME_FUNCTION static cudaError_t + InclusiveScanByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + int num_items, + EqualityOpT equality_op = EqualityOpT(), + cudaStream_t stream = 0, + bool debug_synchronous = false) + { + // Signed integer type for global offsets + using OffsetT = int; + + return DispatchScanByKey::Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + //@} end member group }; /** - * \example example_device_scan.cu + * @example example_device_scan.cu */ CUB_NAMESPACE_END diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index c2d04588be..2d452d3323 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -156,7 +156,7 @@ struct DeviceScanPolicy 128, 12, ///< Threads per block, items per thread OutputT, BLOCK_LOAD_DIRECT, - LOAD_LDG, + LOAD_CA, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, BLOCK_SCAN_RAKING> ScanPolicyT; @@ -170,7 +170,7 @@ struct DeviceScanPolicy 128, 12, ///< Threads per block, items per thread OutputT, BLOCK_LOAD_DIRECT, - LOAD_LDG, + LOAD_CA, ScanTransposedStore, BLOCK_SCAN_WARP_SCANS> ScanPolicyT; @@ -289,6 +289,12 @@ struct DispatchScan: typedef typename ActivePolicyT::ScanPolicyT Policy; typedef typename cub::ScanTileState ScanTileStateT; + // `LOAD_LDG` makes in-place execution UB and doesn't lead to better + // performance. 
+ static_assert( + Policy::LOAD_MODIFIER != CacheLoadModifier::LOAD_LDG, + "The memory consistency model does not apply to texture accesses"); + cudaError error = cudaSuccess; do { diff --git a/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/device/dispatch/dispatch_scan_by_key.cuh index ab71990e88..ff22a6208c 100644 --- a/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -63,10 +63,12 @@ template < typename EqualityOp, ///< Equality functor type typename ScanOpT, ///< Scan functor type typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) - typename OffsetT> ///< Signed integer type for global offsets + typename OffsetT, ///< Signed integer type for global offsets + typename KeyT = cub::detail::value_t> __launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) __global__ void DeviceScanByKeyKernel( KeysInputIteratorT d_keys_in, ///< Input keys data + KeyT *d_keys_prev_in, ///< Predecessor items for each tile ValuesInputIteratorT d_values_in, ///< Input values data ValuesOutputIteratorT d_values_out, ///< Output values data ScanByKeyTileStateT tile_state, ///< Tile status interface @@ -96,6 +98,7 @@ __global__ void DeviceScanByKeyKernel( AgentScanByKeyT( temp_storage, d_keys_in, + d_keys_prev_in, d_values_in, d_values_out, equality_op, @@ -107,6 +110,25 @@ __global__ void DeviceScanByKeyKernel( start_tile); } +template +__global__ void DeviceScanByKeyInitKernel( + ScanTileStateT tile_state, + KeysInputIteratorT d_keys_in, + cub::detail::value_t *d_keys_prev_in, + unsigned items_per_tile, + int num_tiles) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + const unsigned tid = threadIdx.x + blockDim.x * blockIdx.x; + const unsigned tile_base = tid * items_per_tile; + + if (tid > 0 && tid < num_tiles) + { + d_keys_prev_in[tid] = d_keys_in[tile_base - 1]; + } +} 
/****************************************************************************** * Policy @@ -138,7 +160,7 @@ struct DeviceScanByKeyPolicy typedef AgentScanByKeyPolicy< 128, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_LDG, + LOAD_CA, BLOCK_SCAN_WARP_SCANS, BLOCK_STORE_WARP_TRANSPOSE> ScanByKeyPolicyT; @@ -158,7 +180,7 @@ struct DeviceScanByKeyPolicy typedef AgentScanByKeyPolicy< 256, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_LDG, + LOAD_CA, BLOCK_SCAN_WARP_SCANS, BLOCK_STORE_WARP_TRANSPOSE> ScanByKeyPolicyT; @@ -283,11 +305,13 @@ struct DispatchScanByKey: int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; + size_t allocation_sizes[2]; if (CubDebug(error = ScanByKeyTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = sizeof(KeyT) * (num_tiles + 1); + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1] = {}; + void* allocations[2] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; if (d_temp_storage == NULL) { @@ -299,6 +323,8 @@ struct DispatchScanByKey: if (num_items == 0) break; + KeyT *d_keys_prev_in = reinterpret_cast(allocations[1]); + // Construct the tile status interface ScanByKeyTileStateT tile_state; if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; @@ -310,7 +336,7 @@ struct DispatchScanByKey: // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( init_grid_size, INIT_KERNEL_THREADS, 0, stream - ).doit(init_kernel, tile_state, num_tiles); + ).doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, tile_size, num_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) break; @@ -318,7 
+344,6 @@ struct DispatchScanByKey: // Sync the stream if specified to flush runtime errors if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - // Get SM occupancy for scan_kernel int scan_sm_occupancy; if (CubDebug(error = MaxSmOccupancy( @@ -344,6 +369,7 @@ struct DispatchScanByKey: ).doit( scan_kernel, d_keys_in, + d_keys_prev_in, d_values_in, d_values_out, tile_state, @@ -375,7 +401,7 @@ struct DispatchScanByKey: typedef ReduceByKeyScanTileState ScanByKeyTileStateT; // Ensure kernels are instantiated. return Invoke( - DeviceScanInitKernel, + DeviceScanByKeyInitKernel, DeviceScanByKeyKernel< MaxPolicyT, KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, ScanByKeyTileStateT, EqualityOp, ScanOpT, InitValueT, OffsetT> diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu index 7c4efdd41a..6432d5caac 100644 --- a/test/test_device_scan.cu +++ b/test/test_device_scan.cu @@ -89,124 +89,295 @@ struct WrapperFunctor /** * Dispatch to exclusive scan entrypoint */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + IsPrimitiveT /*is_primitive*/, + int timing_timing_iterations, + size_t * /* d_temp_storage_bytes */, + cudaError_t * /* d_cdp_error */, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT /* d_out */, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - cudaError_t error = cudaSuccess; 
- for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items, stream, debug_synchronous); - } - return error; + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + d_in, + scan_op, + initial_value, + num_items, + stream, + debug_synchronous); + } + return error; } +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + IsPrimitiveT /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + stream, + debug_synchronous); + } + return error; +} /** * Dispatch to exclusive sum entrypoint */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - Sum /*scan_op*/, - InitialValueT /*initial_value*/, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + Int2Type /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * 
/*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT /* d_out */, + Sum /*scan_op*/, + InitialValueT /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); - } - return error; + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + num_items, + stream, + debug_synchronous); + } + return error; } +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + Int2Type /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + Sum /*scan_op*/, + InitialValueT /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + stream, + debug_synchronous); + } + return error; +} /** * Dispatch to inclusive scan entrypoint */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - IsPrimitiveT /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - NullType /*initial_value*/, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +CUB_RUNTIME_FUNCTION 
__forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + IsPrimitiveT /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT /* d_out */, + ScanOpT scan_op, + NullType /* initial_value */, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream, debug_synchronous); - } - return error; + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::InclusiveScan(d_temp_storage, + temp_storage_bytes, + d_in, + scan_op, + num_items, + stream, + debug_synchronous); + } + return error; } +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + IsPrimitiveT /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + NullType /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::InclusiveScan(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + num_items, + stream, + debug_synchronous); + } + return error; +} /** * Dispatch to inclusive sum entrypoint */ template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - Int2Type /*is_primitive*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void* d_temp_storage, - 
size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - Sum /*scan_op*/, - NullType /*initial_value*/, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + Int2Type /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT /* d_out */, + Sum /*scan_op*/, + NullType /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - cudaError_t error = cudaSuccess; - for (int i = 0; i < timing_timing_iterations; ++i) - { - error = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); - } - return error; + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + num_items, + stream, + debug_synchronous); + } + return error; +} + +template +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t +Dispatch(Int2Type /*in_place*/, + Int2Type /*dispatch_to*/, + Int2Type /*is_primitive*/, + int timing_timing_iterations, + size_t * /*d_temp_storage_bytes*/, + cudaError_t * /*d_cdp_error*/, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + Sum /*scan_op*/, + NullType /*initial_value*/, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + stream, + debug_synchronous); + } + return error; } //--------------------------------------------------------------------- @@ -216,104 +387,116 @@ cudaError_t Dispatch( /** * Simple 
wrapper kernel to invoke DeviceScan */ -template -__global__ void CnpDispatchKernel( - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - bool debug_synchronous) +template +__global__ void CnpDispatchKernel(Int2Type /*in_place*/, + IsPrimitiveT is_primitive, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + void *d_temp_storage, + size_t temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + bool debug_synchronous) { #ifndef CUB_CDP - (void)is_primitive; - (void)timing_timing_iterations; - (void)d_temp_storage_bytes; - (void)d_cdp_error; - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_out; - (void)scan_op; - (void)initial_value; - (void)num_items; - (void)debug_synchronous; - *d_cdp_error = cudaErrorNotSupported; + (void)is_primitive; + (void)timing_timing_iterations; + (void)d_temp_storage_bytes; + (void)d_cdp_error; + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_out; + (void)scan_op; + (void)initial_value; + (void)num_items; + (void)debug_synchronous; + *d_cdp_error = cudaErrorNotSupported; #else - *d_cdp_error = Dispatch( - Int2Type(), - is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - initial_value, - num_items, - 0, - debug_synchronous); - - *d_temp_storage_bytes = temp_storage_bytes; + *d_cdp_error = Dispatch(Int2Type(), + is_primitive, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + 0, + debug_synchronous); + + 
*d_temp_storage_bytes = temp_storage_bytes; #endif } - /** * Dispatch to CDP kernel */ -template -cudaError_t Dispatch( - Int2Type dispatch_to, - IsPrimitiveT is_primitive, - int timing_timing_iterations, - size_t *d_temp_storage_bytes, - cudaError_t *d_cdp_error, - - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitialValueT initial_value, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) +template +cudaError_t Dispatch(Int2Type /*in_place*/, + Int2Type dispatch_to, + IsPrimitiveT is_primitive, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + ScanOpT scan_op, + InitialValueT initial_value, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) { - // Invoke kernel to invoke device-side dispatch - CnpDispatchKernel<<<1,1>>>( - is_primitive, - timing_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - initial_value, - num_items, - debug_synchronous); - - // Copy out temp_storage_bytes - CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); - - // Copy out error - cudaError_t retval; - CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); - return retval; + // Invoke kernel to invoke device-side dispatch + CnpDispatchKernel<<<1, 1>>>(is_primitive, + timing_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + debug_synchronous); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, + d_temp_storage_bytes, + sizeof(size_t) * 1, + cudaMemcpyDeviceToHost)); + + // Copy out error + cudaError_t retval; + 
CubDebugExit(cudaMemcpy(&retval, + d_cdp_error, + sizeof(cudaError_t) * 1, + cudaMemcpyDeviceToHost)); + return retval; } - //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- @@ -416,7 +599,7 @@ struct AllocateOutput { template struct AllocateOutput { - static void run(OutputT *&d_out, OutputT *d_in, int num_items) { + static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */) { d_out = d_in; } }; @@ -454,6 +637,7 @@ void Test( void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch( + Int2Type(), Int2Type(), Int2Type::PRIMITIVE>(), 1, @@ -470,11 +654,15 @@ void Test( true)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); - // Clear device output array - CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items)); + if (!InPlace) + { + // Clear device output array + CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items)); + } // Run warmup/correctness iteration CubDebugExit(Dispatch( + Int2Type(), Int2Type(), Int2Type::PRIMITIVE>(), 1, @@ -499,39 +687,51 @@ void Test( fflush(stderr); // Performance - GpuTimer gpu_timer; - gpu_timer.Start(); - CubDebugExit(Dispatch(Int2Type(), - Int2Type::PRIMITIVE>(), - g_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - initial_value, - num_items, - 0, - false)); - gpu_timer.Stop(); - float elapsed_millis = gpu_timer.ElapsedMillis(); - - // Display performance if (g_timing_iterations > 0) { - float avg_millis = elapsed_millis / g_timing_iterations; - float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; - float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); - printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", - avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); + 
GpuTimer gpu_timer; + gpu_timer.Start(); + CubDebugExit(Dispatch(Int2Type(), + Int2Type(), + Int2Type::PRIMITIVE>(), + g_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + initial_value, + num_items, + 0, + false)); + gpu_timer.Stop(); + float elapsed_millis = gpu_timer.ElapsedMillis(); + + // Display performance + float avg_millis = elapsed_millis / g_timing_iterations; + float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; + float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); + printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% " + "peak", + avg_millis, + giga_rate, + giga_bandwidth, + giga_bandwidth / g_device_giga_bandwidth * 100.0); } printf("\n\n"); // Cleanup - if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); + if (!InPlace) + { + if (d_out) + { + CubDebugExit(g_allocator.DeviceFree(d_out)); + } + } + if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); @@ -626,37 +826,30 @@ TestFutureInitValueIter( // cub::NullType does not have device pointer, so nothing to do here } -template < - Backend BACKEND, - typename DeviceInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT> -auto TestInplace( - DeviceInputIteratorT d_in, - OutputT *h_reference, - int num_items, - ScanOpT scan_op, - InitialValueT initial_value) -> typename std::enable_if::value>::type +template +auto TestInplace(OutputT *d_in, + OutputT *h_reference, + int num_items, + ScanOpT scan_op, + InitialValueT initial_value) { - Test(d_in, h_reference, num_items, scan_op, initial_value); + Test(d_in, + h_reference, + num_items, + scan_op, + initial_value); } -template < - Backend BACKEND, - typename DeviceInputIteratorT, - typename OutputT, - typename ScanOpT, - typename 
InitialValueT> -auto TestInplace( - DeviceInputIteratorT d_in, - OutputT *, - int, - ScanOpT, - InitialValueT) -> typename std::enable_if::value>::type -{ - (void)d_in; -} +template +auto TestInplace(DeviceInputIteratorT, OutputT *, int, ScanOpT, InitialValueT) +{} /** * Test DeviceScan on pointer type @@ -946,5 +1139,3 @@ int main(int argc, char** argv) return 0; } - - diff --git a/test/test_device_scan_by_key.cu b/test/test_device_scan_by_key.cu index c5273befda..ba1d548675 100644 --- a/test/test_device_scan_by_key.cu +++ b/test/test_device_scan_by_key.cu @@ -64,6 +64,14 @@ enum Backend }; +enum AliasMode +{ + AliasNone, // output is allocated + AliasKeys, // output is an alias of input keys + AliasValues // output is an alias of input values +}; + + /** * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) */ @@ -473,7 +481,7 @@ struct AllocateOutput { template struct AllocateOutput { - static void run(OutputT *&d_out, OutputT *d_in, int num_items) { + static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */) { d_out = d_in; } }; @@ -489,7 +497,7 @@ template < typename ScanOpT, typename InitialValueT, typename EqualityOpT, - bool InPlace=false> + AliasMode Mode=AliasNone> void Test( KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, @@ -504,7 +512,21 @@ void Test( // Allocate device output array OutputT *d_values_out = NULL; - AllocateOutput::run(d_values_out, d_values_in, num_items); + + if (Mode == AliasKeys) + { + AllocateOutput::run( + d_values_out, + d_keys_in, + num_items); + } + else + { + AllocateOutput::run( + d_values_out, + d_values_in, + num_items); + } // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; @@ -535,7 +557,10 @@ void Test( CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Clear device output array - CubDebugExit(cudaMemset(d_values_out, 0, sizeof(OutputT) * num_items)); + if (Mode == AliasNone) + { + CubDebugExit(cudaMemset(d_values_out, 
0, sizeof(OutputT) * num_items)); + } // Run warmup/correctness iteration CubDebugExit(Dispatch( @@ -557,38 +582,43 @@ void Test( true)); // Check for correctness (and display results, if specified) - int compare = CompareDeviceResults(h_reference, d_values_out, num_items, true, g_verbose); + const int compare = CompareDeviceResults(h_reference, + d_values_out, + num_items, + true, + g_verbose); + printf("\t%s", compare ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); - // Performance - GpuTimer gpu_timer; - gpu_timer.Start(); - CubDebugExit(Dispatch(Int2Type(), - Int2Type::PRIMITIVE>(), - g_timing_iterations, - d_temp_storage_bytes, - d_cdp_error, - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - initial_value, - num_items, - equality_op, - 0, - false)); - gpu_timer.Stop(); - float elapsed_millis = gpu_timer.ElapsedMillis(); - // Display performance if (g_timing_iterations > 0) { + // Performance + GpuTimer gpu_timer; + gpu_timer.Start(); + CubDebugExit(Dispatch(Int2Type(), + Int2Type::PRIMITIVE>(), + g_timing_iterations, + d_temp_storage_bytes, + d_cdp_error, + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + scan_op, + initial_value, + num_items, + equality_op, + 0, + false)); + + gpu_timer.Stop(); + float elapsed_millis = gpu_timer.ElapsedMillis(); float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); @@ -599,7 +629,14 @@ void Test( printf("\n\n"); // Cleanup - if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out)); + if (Mode == AliasNone) + { + if (d_values_out) + { + CubDebugExit(g_allocator.DeviceFree(d_values_out)); + } + } + if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if 
(d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); @@ -608,46 +645,98 @@ void Test( AssertEquals(0, compare); } -template < - Backend BACKEND, - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -auto TestInplace( - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - OutputT *h_reference, - int num_items, - ScanOpT scan_op, - InitialValueT initial_value, - EqualityOpT equality_op) -> typename std::enable_if::value>::type +template +auto TestInplaceValues(KeysInputIteratorT d_keys_in, + OutputT *d_values_in, + OutputT *h_reference, + int num_items, + ScanOpT scan_op, + InitialValueT initial_value, + EqualityOpT equality_op) { - Test(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); + Test(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); } -template < - Backend BACKEND, - typename KeysInputIteratorT, - typename ValuesInputIteratorT, - typename OutputT, - typename ScanOpT, - typename InitialValueT, - typename EqualityOpT> -auto TestInplace( - KeysInputIteratorT, - ValuesInputIteratorT d_values_in, - OutputT *, - int, - ScanOpT, - InitialValueT, - EqualityOpT) -> typename std::enable_if::value>::type +template +auto TestInplaceValues(KeysInputIteratorT, + ValuesInputIteratorT, + OutputT *, + int, + ScanOpT, + InitialValueT, + EqualityOpT) +{} + +template +auto TestInplaceKeys(T *d_keys_in, + ValuesInputIteratorT d_values_in, + T *h_reference, + int num_items, + ScanOpT scan_op, + InitialValueT initial_value, + EqualityOpT equality_op) { - (void)d_values_in; + Test(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); } +template +auto TestInplaceKeys(KeysInputIteratorT, + ValuesInputIteratorT, + OutputT *, + int, + ScanOpT, + InitialValueT, + EqualityOpT) +{} + /** * Test DeviceScan on pointer type */ @@ 
-676,7 +765,7 @@ void TestPointer( fflush(stdout); // Allocate host arrays - KeyT* h_keys_in = new KeyT[num_items]; + KeyT* h_keys_in = new KeyT[num_items]; InputT* h_values_in = new InputT[num_items]; OutputT* h_reference = new OutputT[num_items]; @@ -711,8 +800,33 @@ void TestPointer( CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); // Run Test - Test(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); - TestInplace(d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op); + Test(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); + + // Test in/out values aliasing + TestInplaceValues(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); + + CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); + + // Test keys/values aliasing (should go last, changes keys) + TestInplaceKeys(d_keys_in, + d_values_in, + h_reference, + num_items, + scan_op, + initial_value, + equality_op); // Cleanup if (h_keys_in) delete[] h_keys_in; @@ -850,7 +964,7 @@ void TestKeyTAndEqualityOp( OutputT initial_value) { TestOp(num_items, identity, initial_value, Equality()); - TestOp( num_items, identity, initial_value, Mod2Equality()); + TestOp( num_items, identity, initial_value, Mod2Equality()); } /** @@ -933,7 +1047,6 @@ int main(int argc, char** argv) TestSize(num_items, (unsigned long long)0, (unsigned long long)99); - #elif TEST_VALUE_TYPES == 2 TestSize(num_items, make_uchar2(0, 0), make_uchar2(17, 21));