Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge branch 'main' into onesweep-begin-end-bit
Browse files Browse the repository at this point in the history
  • Loading branch information
canonizer committed May 31, 2022
2 parents 398e50f + 4e4ea96 commit a68b45e
Show file tree
Hide file tree
Showing 75 changed files with 5,278 additions and 3,106 deletions.
52 changes: 52 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,55 @@
# CUB 1.17.0

## Summary

CUB 1.17.0 is the final minor release of the 1.X series. It provides a variety
of bug fixes and miscellaneous enhancements, detailed below.

## Known Issues

### "Run-to-run" Determinism Broken

Several CUB device algorithms are documented to provide deterministic results
(per device) for non-associative reduction operators (e.g. floating-point
addition). Unfortunately, the implementations of these algorithms contain
performance optimizations that violate this guarantee.
The `DeviceReduce::ReduceByKey` and `DeviceScan` algorithms are known to be
affected. We're currently evaluating the scope and impact of correcting this in
a future CUB release. See NVIDIA/cub#471 for details.

## Bug Fixes

- NVIDIA/cub#444: Fixed `DeviceSelect` to work with discard iterators and mixed
input/output types.
- NVIDIA/cub#452: Fixed install issue when `CMAKE_INSTALL_LIBDIR` contained
nested directories. Thanks to @robertmaynard for this contribution.
- NVIDIA/cub#462: Fixed bug that produced incorrect results
from `DeviceSegmentedSort` on sm_61 and sm_70.
- NVIDIA/cub#464: Fixed `DeviceSelect::Flagged` so that flags are normalized to
0 or 1.
- NVIDIA/cub#468: Fixed overflow issues in `DeviceRadixSort` given `num_items`
close to 2^32. Thanks to @canonizer for this contribution.

## Other Enhancements

- NVIDIA/cub#445: Remove device-sync in `DeviceSegmentedSort` when launched via
CDP.
- NVIDIA/cub#449: Fixed invalid link in documentation. Thanks to @kshitij12345
for this contribution.
- NVIDIA/cub#450: `BlockDiscontinuity`: Replaced recursive-template loop
unrolling with `#pragma unroll`. Thanks to @kshitij12345 for this
contribution.
- NVIDIA/cub#451: Replaced the deprecated `TexRefInputIterator` implementation
with an alias to `TexObjInputIterator`. This fully removes all usages of the
deprecated CUDA texture reference APIs from CUB.
- NVIDIA/cub#456: `BlockAdjacentDifference`: Replaced recursive-template loop
unrolling with `#pragma unroll`. Thanks to @kshitij12345 for this
contribution.
- NVIDIA/cub#466: `cub::DeviceAdjacentDifference` API has been updated to use
the new `OffsetT` deduction approach described in NVIDIA/cub#212.
- NVIDIA/cub#470: Fix several doxygen-related warnings. Thanks to @karthikeyann
for this contribution.

# CUB 1.16.0

## Summary
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ See the [changelog](CHANGELOG.md) for details about specific releases.
| CUB Release | Included In |
| ------------------------- | --------------------------------------- |
| 1.17.0 | TBD |
| 1.16.0 | TBD |
| 1.15.0 | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6 |
| 1.14.0 | NVIDIA HPC SDK 21.9 |
Expand Down
7 changes: 1 addition & 6 deletions cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ template <
typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
typename OffsetT, ///< Signed integer type for global offsets
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused)
struct AgentHistogram
{
//---------------------------------------------------------------------
Expand Down Expand Up @@ -562,15 +562,10 @@ struct AgentHistogram
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

// Accumulate samples
#if CUB_PTX_ARCH >= 120
if (prefer_smem)
AccumulateSmemPixels(samples, is_valid);
else
AccumulateGmemPixels(samples, is_valid);
#else
AccumulateGmemPixels(samples, is_valid);
#endif

}


Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ struct AgentRle
// Constants
enum
{
WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
WARP_THREADS = CUB_WARP_THREADS(0),
BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD,
WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD,
Expand Down
62 changes: 38 additions & 24 deletions cub/agent/agent_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,16 @@

#pragma once

#include <iterator>
#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_discontinuity.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>
#include <cub/config.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/util_type.cuh>

#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_discontinuity.cuh"
#include "../config.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include <iterator>


CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -138,7 +139,7 @@ struct AgentScanByKey
using TilePrefixCallbackT = TilePrefixCallbackOp<SizeValuePairT, ReduceBySegmentOpT, ScanTileStateT>;
using BlockScanT = BlockScan<SizeValuePairT, BLOCK_THREADS, AgentScanByKeyPolicyT::SCAN_ALGORITHM, 1, 1>;

union TempStorage
union TempStorage_
{
struct ScanStorage
{
Expand All @@ -152,12 +153,15 @@ struct AgentScanByKey
typename BlockStoreValuesT::TempStorage store_values;
};

struct TempStorage : cub::Uninitialized<TempStorage_> {};

//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------

TempStorage & storage;
TempStorage_ &storage;
WrappedKeysInputIteratorT d_keys_in;
KeyT* d_keys_prev_in;
WrappedValuesInputIteratorT d_values_in;
ValuesOutputIteratorT d_values_out;
InequalityWrapper<EqualityOp> inequality_op;
Expand Down Expand Up @@ -364,19 +368,27 @@ struct AgentScanByKey
}
else
{
KeyT tile_pred_key = (threadIdx.x == 0) ? d_keys_in[tile_base - 1] : KeyT();
BlockDiscontinuityKeysT(storage.scan_storage.discontinuity)
.FlagHeads(segment_flags, keys, inequality_op, tile_pred_key);

// Zip values and segment_flags
ZipValuesAndFlags<IS_LAST_TILE>(num_remaining,
values,
segment_flags,
scan_items);

SizeValuePairT tile_aggregate;
TilePrefixCallbackT prefix_op(tile_state, storage.scan_storage.prefix, pair_scan_op, tile_idx);
ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type<IS_INCLUSIVE>());
KeyT tile_pred_key = (threadIdx.x == 0) ? d_keys_prev_in[tile_idx]
: KeyT();

BlockDiscontinuityKeysT(storage.scan_storage.discontinuity)
.FlagHeads(segment_flags, keys, inequality_op, tile_pred_key);

// Zip values and segment_flags
ZipValuesAndFlags<IS_LAST_TILE>(num_remaining,
values,
segment_flags,
scan_items);

SizeValuePairT tile_aggregate;
TilePrefixCallbackT prefix_op(tile_state,
storage.scan_storage.prefix,
pair_scan_op,
tile_idx);
ScanTile(scan_items,
tile_aggregate,
prefix_op,
Int2Type<IS_INCLUSIVE>());
}

CTA_SYNC();
Expand Down Expand Up @@ -408,14 +420,16 @@ struct AgentScanByKey
AgentScanByKey(
TempStorage & storage,
KeysInputIteratorT d_keys_in,
KeyT * d_keys_prev_in,
ValuesInputIteratorT d_values_in,
ValuesOutputIteratorT d_values_out,
EqualityOp equality_op,
ScanOpT scan_op,
InitValueT init_value)
:
storage(storage),
storage(storage.Alias()),
d_keys_in(d_keys_in),
d_keys_prev_in(d_keys_prev_in),
d_values_in(d_values_in),
d_values_out(d_values_out),
inequality_op(equality_op),
Expand Down
3 changes: 1 addition & 2 deletions cub/agent/agent_segment_fixup.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,7 @@ struct AgentSegmentFixup
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

// Whether or not do fixup using RLE + global atomics
USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) &&
(std::is_same<ValueT, float>::value ||
USE_ATOMIC_FIXUP = (std::is_same<ValueT, float>::value ||
std::is_same<ValueT, int>::value ||
std::is_same<ValueT, unsigned int>::value ||
std::is_same<ValueT, unsigned long long>::value),
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ template <
typename OffsetT, ///< Signed integer type for sequence offsets
bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1
bool HAS_BETA, ///< Whether the input parameter \p beta is 0
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused)
struct AgentSpmv
{
//---------------------------------------------------------------------
Expand Down
33 changes: 24 additions & 9 deletions cub/agent/agent_sub_warp_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
#include <cub/warp/warp_merge_sort.cuh>
#include <cub/warp/warp_store.cuh>

#include <nv/target>

#include <thrust/system/cuda/detail/core/util.h>


Expand Down Expand Up @@ -108,6 +110,23 @@ class AgentSubWarpSort
{
template <typename T>
__device__ bool operator()(T lhs, T rhs)
{
return this->impl(lhs, rhs);
}

#if defined(__CUDA_FP16_TYPES_EXIST__)
__device__ bool operator()(__half lhs, __half rhs)
{
// Need to explicitly cast to float for SM <= 52.
NV_IF_TARGET(NV_PROVIDES_SM_53,
(return this->impl(lhs, rhs);),
(return this->impl(__half2float(lhs), __half2float(rhs));));
}
#endif

private:
template <typename T>
__device__ bool impl(T lhs, T rhs)
{
if (IS_DESCENDING)
{
Expand All @@ -118,19 +137,15 @@ class AgentSubWarpSort
return lhs < rhs;
}
}

#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
__device__ bool operator()(__half lhs, __half rhs)
{
return (*this)(__half2float(lhs), __half2float(rhs));
}
#endif
};

#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
#if defined(__CUDA_FP16_TYPES_EXIST__)
__device__ static bool equal(__half lhs, __half rhs)
{
return __half2float(lhs) == __half2float(rhs);
// Need to explicitly cast to float for SM <= 52.
NV_IF_TARGET(NV_PROVIDES_SM_53,
(return lhs == rhs;),
(return __half2float(lhs) == __half2float(rhs);));
}
#endif

Expand Down
4 changes: 2 additions & 2 deletions cub/agent/single_pass_scan_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -666,11 +666,11 @@ template <
typename T,
typename ScanOpT,
typename ScanTileStateT,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
struct TilePrefixCallbackOp
{
// Parameterized warp reduce
typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
typedef WarpReduce<T, CUB_PTX_WARP_THREADS> WarpReduceT;

// Temporary storage type
struct _TempStorage
Expand Down
7 changes: 3 additions & 4 deletions cub/block/block_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@

CUB_NAMESPACE_BEGIN


/**
* @brief BlockAdjacentDifference provides
* [<em>collective</em>](index.html#sec0) methods for computing the
Expand Down Expand Up @@ -125,9 +124,9 @@ CUB_NAMESPACE_BEGIN
*/
template <typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int LEGACY_PTX_ARCH = 0>
class BlockAdjacentDifference
{
private:
Expand Down
4 changes: 2 additions & 2 deletions cub/block/block_discontinuity.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
Expand Down Expand Up @@ -107,7 +107,7 @@ template <
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockDiscontinuity
{
private:
Expand Down
9 changes: 4 additions & 5 deletions cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ CUB_NAMESPACE_BEGIN
* \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - It is commonplace for blocks of threads to rearrange data items between
Expand Down Expand Up @@ -114,7 +114,7 @@ template <
bool WARP_TIME_SLICING = false,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockExchange
{
private:
Expand All @@ -129,11 +129,11 @@ private:
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0),
SMEM_BANKS = 1 << LOG_SMEM_BANKS,

TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
Expand Down Expand Up @@ -1126,4 +1126,3 @@ public:


CUB_NAMESPACE_END

Loading

0 comments on commit a68b45e

Please sign in to comment.