Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge branch 'main' into onesweep-begin-end-bit
Browse files Browse the repository at this point in the history
  • Loading branch information
canonizer committed May 31, 2022
2 parents 398e50f + 4e4ea96 commit a68b45e
Show file tree
Hide file tree
Showing 75 changed files with 5,278 additions and 3,106 deletions.
52 changes: 52 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,55 @@
# CUB 1.17.0

## Summary

CUB 1.17.0 is the final minor release of the 1.X series. It provides a variety
of bug fixes and miscellaneous enhancements, detailed below.

## Known Issues

### "Run-to-run" Determinism Broken

Several CUB device algorithms are documented to provide deterministic results
(per device) for non-associative reduction operators (e.g. floating-point
addition). Unfortunately, the implementations of these algorithms contain
performance optimizations that violate this guarantee.
The `DeviceReduce::ReduceByKey` and `DeviceScan` algorithms are known to be
affected. We're currently evaluating the scope and impact of correcting this in
a future CUB release. See NVIDIA/cub#471 for details.

## Bug Fixes

- NVIDIA/cub#444: Fixed `DeviceSelect` to work with discard iterators and mixed
input/output types.
- NVIDIA/cub#452: Fixed install issue when `CMAKE_INSTALL_LIBDIR` contained
nested directories. Thanks to @robertmaynard for this contribution.
- NVIDIA/cub#462: Fixed bug that produced incorrect results
from `DeviceSegmentedSort` on sm_61 and sm_70.
- NVIDIA/cub#464: Fixed `DeviceSelect::Flagged` so that flags are normalized to
0 or 1.
- NVIDIA/cub#468: Fixed overflow issues in `DeviceRadixSort` given `num_items`
close to 2^32. Thanks to @canonizer for this contribution.

## Other Enhancements

- NVIDIA/cub#445: Remove device-sync in `DeviceSegmentedSort` when launched via
CDP.
- NVIDIA/cub#449: Fixed invalid link in documentation. Thanks to @kshitij12345
for this contribution.
- NVIDIA/cub#450: `BlockDiscontinuity`: Replaced recursive-template loop
unrolling with `#pragma unroll`. Thanks to @kshitij12345 for this
contribution.
- NVIDIA/cub#451: Replaced the deprecated `TexRefInputIterator` implementation
with an alias to `TexObjInputIterator`. This fully removes all usages of the
deprecated CUDA texture reference APIs from CUB.
- NVIDIA/cub#456: `BlockAdjacentDifference`: Replaced recursive-template loop
unrolling with `#pragma unroll`. Thanks to @kshitij12345 for this
contribution.
- NVIDIA/cub#466: `cub::DeviceAdjacentDifference` API has been updated to use
the new `OffsetT` deduction approach described in NVIDIA/cub#212.
- NVIDIA/cub#470: Fix several doxygen-related warnings. Thanks to @karthikeyann
for this contribution.

# CUB 1.16.0

## Summary
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ See the [changelog](CHANGELOG.md) for details about specific releases.
| CUB Release | Included In |
| ------------------------- | --------------------------------------- |
| 1.17.0 | TBD |
| 1.16.0 | TBD |
| 1.15.0 | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6 |
| 1.14.0 | NVIDIA HPC SDK 21.9 |
Expand Down
7 changes: 1 addition & 6 deletions cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ template <
typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
typename OffsetT, ///< Signed integer type for global offsets
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused)
struct AgentHistogram
{
//---------------------------------------------------------------------
Expand Down Expand Up @@ -562,15 +562,10 @@ struct AgentHistogram
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

// Accumulate samples
#if CUB_PTX_ARCH >= 120
if (prefer_smem)
AccumulateSmemPixels(samples, is_valid);
else
AccumulateGmemPixels(samples, is_valid);
#else
AccumulateGmemPixels(samples, is_valid);
#endif

}


Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ struct AgentRle
// Constants
enum
{
WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
WARP_THREADS = CUB_WARP_THREADS(0),
BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD,
WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD,
Expand Down
62 changes: 38 additions & 24 deletions cub/agent/agent_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,16 @@

#pragma once

#include <iterator>
#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_discontinuity.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>
#include <cub/config.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/util_type.cuh>

#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_discontinuity.cuh"
#include "../config.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include <iterator>


CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -138,7 +139,7 @@ struct AgentScanByKey
using TilePrefixCallbackT = TilePrefixCallbackOp<SizeValuePairT, ReduceBySegmentOpT, ScanTileStateT>;
using BlockScanT = BlockScan<SizeValuePairT, BLOCK_THREADS, AgentScanByKeyPolicyT::SCAN_ALGORITHM, 1, 1>;

union TempStorage
union TempStorage_
{
struct ScanStorage
{
Expand All @@ -152,12 +153,15 @@ struct AgentScanByKey
typename BlockStoreValuesT::TempStorage store_values;
};

struct TempStorage : cub::Uninitialized<TempStorage_> {};

//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------

TempStorage & storage;
TempStorage_ &storage;
WrappedKeysInputIteratorT d_keys_in;
KeyT* d_keys_prev_in;
WrappedValuesInputIteratorT d_values_in;
ValuesOutputIteratorT d_values_out;
InequalityWrapper<EqualityOp> inequality_op;
Expand Down Expand Up @@ -364,19 +368,27 @@ struct AgentScanByKey
}
else
{
KeyT tile_pred_key = (threadIdx.x == 0) ? d_keys_in[tile_base - 1] : KeyT();
BlockDiscontinuityKeysT(storage.scan_storage.discontinuity)
.FlagHeads(segment_flags, keys, inequality_op, tile_pred_key);

// Zip values and segment_flags
ZipValuesAndFlags<IS_LAST_TILE>(num_remaining,
values,
segment_flags,
scan_items);

SizeValuePairT tile_aggregate;
TilePrefixCallbackT prefix_op(tile_state, storage.scan_storage.prefix, pair_scan_op, tile_idx);
ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type<IS_INCLUSIVE>());
KeyT tile_pred_key = (threadIdx.x == 0) ? d_keys_prev_in[tile_idx]
: KeyT();

BlockDiscontinuityKeysT(storage.scan_storage.discontinuity)
.FlagHeads(segment_flags, keys, inequality_op, tile_pred_key);

// Zip values and segment_flags
ZipValuesAndFlags<IS_LAST_TILE>(num_remaining,
values,
segment_flags,
scan_items);

SizeValuePairT tile_aggregate;
TilePrefixCallbackT prefix_op(tile_state,
storage.scan_storage.prefix,
pair_scan_op,
tile_idx);
ScanTile(scan_items,
tile_aggregate,
prefix_op,
Int2Type<IS_INCLUSIVE>());
}

CTA_SYNC();
Expand Down Expand Up @@ -408,14 +420,16 @@ struct AgentScanByKey
AgentScanByKey(
TempStorage & storage,
KeysInputIteratorT d_keys_in,
KeyT * d_keys_prev_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
EqualityOp equality_op,
ScanOpT scan_op,
InitValueT init_value)
:
storage(storage),
storage(storage.Alias()),
d_keys_in(d_keys_in),
d_keys_prev_in(d_keys_prev_in),
d_values_in(d_values_in),
d_values_out(d_values_out),
inequality_op(equality_op),
Expand Down
3 changes: 1 addition & 2 deletions cub/agent/agent_segment_fixup.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,7 @@ struct AgentSegmentFixup
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

// Whether or not do fixup using RLE + global atomics
USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) &&
(std::is_same<ValueT, float>::value ||
USE_ATOMIC_FIXUP = (std::is_same<ValueT, float>::value ||
std::is_same<ValueT, int>::value ||
std::is_same<ValueT, unsigned int>::value ||
std::is_same<ValueT, unsigned long long>::value),
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ template <
typename OffsetT, ///< Signed integer type for sequence offsets
bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1
bool HAS_BETA, ///< Whether the input parameter \p beta is 0
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused)
struct AgentSpmv
{
//---------------------------------------------------------------------
Expand Down
33 changes: 24 additions & 9 deletions cub/agent/agent_sub_warp_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
#include <cub/warp/warp_merge_sort.cuh>
#include <cub/warp/warp_store.cuh>

#include <nv/target>

#include <thrust/system/cuda/detail/core/util.h>


Expand Down Expand Up @@ -108,6 +110,23 @@ class AgentSubWarpSort
{
template <typename T>
__device__ bool operator()(T lhs, T rhs)
{
return this->impl(lhs, rhs);
}

#if defined(__CUDA_FP16_TYPES_EXIST__)
__device__ bool operator()(__half lhs, __half rhs)
{
// Need to explicitly cast to float for SM <= 52.
NV_IF_TARGET(NV_PROVIDES_SM_53,
(return this->impl(lhs, rhs);),
(return this->impl(__half2float(lhs), __half2float(rhs));));
}
#endif

private:
template <typename T>
__device__ bool impl(T lhs, T rhs)
{
if (IS_DESCENDING)
{
Expand All @@ -118,19 +137,15 @@ class AgentSubWarpSort
return lhs < rhs;
}
}

#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
__device__ bool operator()(__half lhs, __half rhs)
{
return (*this)(__half2float(lhs), __half2float(rhs));
}
#endif
};

#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
#if defined(__CUDA_FP16_TYPES_EXIST__)
__device__ static bool equal(__half lhs, __half rhs)
{
return __half2float(lhs) == __half2float(rhs);
// Need to explicitly cast to float for SM <= 52.
NV_IF_TARGET(NV_PROVIDES_SM_53,
(return lhs == rhs;),
(return __half2float(lhs) == __half2float(rhs);));
}
#endif

Expand Down
4 changes: 2 additions & 2 deletions cub/agent/single_pass_scan_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -666,11 +666,11 @@ template <
typename T,
typename ScanOpT,
typename ScanTileStateT,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
struct TilePrefixCallbackOp
{
// Parameterized warp reduce
typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
typedef WarpReduce<T, CUB_PTX_WARP_THREADS> WarpReduceT;

// Temporary storage type
struct _TempStorage
Expand Down
7 changes: 3 additions & 4 deletions cub/block/block_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@

CUB_NAMESPACE_BEGIN


/**
* @brief BlockAdjacentDifference provides
* [<em>collective</em>](index.html#sec0) methods for computing the
Expand Down Expand Up @@ -125,9 +124,9 @@ CUB_NAMESPACE_BEGIN
*/
template <typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int LEGACY_PTX_ARCH = 0>
class BlockAdjacentDifference
{
private:
Expand Down
4 changes: 2 additions & 2 deletions cub/block/block_discontinuity.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
Expand Down Expand Up @@ -107,7 +107,7 @@ template <
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockDiscontinuity
{
private:
Expand Down
9 changes: 4 additions & 5 deletions cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ CUB_NAMESPACE_BEGIN
* \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - It is commonplace for blocks of threads to rearrange data items between
Expand Down Expand Up @@ -114,7 +114,7 @@ template <
bool WARP_TIME_SLICING = false,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockExchange
{
private:
Expand All @@ -129,11 +129,11 @@ private:
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0),
SMEM_BANKS = 1 << LOG_SMEM_BANKS,

TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
Expand Down Expand Up @@ -1126,4 +1126,3 @@ public:


CUB_NAMESPACE_END

Loading

0 comments on commit a68b45e

Please sign in to comment.