This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Fix overflow in reduce #592

Merged 2 commits on Nov 25, 2022
95 changes: 63 additions & 32 deletions cub/agent/agent_reduce.cuh
@@ -355,7 +355,7 @@ struct AgentReduce
{
AccumT thread_aggregate{};

- if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+ if (even_share.block_end - even_share.block_offset < TILE_ITEMS)
Contributor:

To clarify my assumptions: is it always true that even_share.block_end <= even_share.block_offset? Can they be equal?

Collaborator:

I am not happy that we transform the condition differently here and below. I like @canonizer's suggestion below.

Collaborator Author:

@miscco, @canonizer's suggestion doesn't change the fact that this line, or the line below, has to be changed.

Collaborator Author:

@canonizer, for segmented reduce, block_offset is always less than block_end. For reduce, the number of blocks is about RoundUp(num_items, tile_size), while block_offset is just block_id * TILE_ITEMS and block_end is num_items. The case of num_items == 0 is treated differently, so I don't think block_end can be equal to block_offset. Could you elaborate on why it's relevant here?
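
A minimal host-side illustration of the arithmetic described above, using a hypothetical 32-bit OffsetT and a made-up tile size of 512 (only the relationships mirror the comment), showing why the old check can wrap around while the rewritten one cannot:

#include <cstdint>
#include <cstdio>

int main()
{
  using OffsetT = std::uint32_t;          // hypothetical offset type
  const OffsetT tile_items = 512;         // hypothetical tile size
  const OffsetT num_items  = 0xFFFFFFFFu; // close to the OffsetT maximum

  // Last tile of the range: block_offset = block_id * tile_items < block_end = num_items
  const OffsetT block_id     = (num_items - 1) / tile_items;
  const OffsetT block_offset = block_id * tile_items; // 4294966784
  const OffsetT block_end    = num_items;             // 4294967295, i.e. 511 valid items remain

  // Old check: block_offset + tile_items wraps past the 32-bit maximum to 0,
  // so the partially-full tile is mistaken for a full one (prints 0).
  std::printf("old check sees partial tile: %d\n", block_offset + tile_items > block_end);

  // New check: block_end - block_offset cannot overflow because block_offset < block_end,
  // so the 511 remaining items are detected correctly (prints 1).
  std::printf("new check sees partial tile: %d\n", block_end - block_offset < tile_items);
  return 0;
}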

Collaborator:

I am wondering whether the whole algorithm would be simpler if we used `int valid_items = even_share.block_end - even_share.block_offset;` as the main variable instead of repeatedly computing the remaining number of items.

That said, the change is definitely correct, and a large-scale refactor is a bit too much right now.
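
For illustration, a rough sketch of that suggestion (not part of this PR): track the remaining item count in one variable instead of recomputing `block_end - block_offset`. It assumes the same AgentReduce members used by the new ConsumeFullTileRange below (`ConsumeTile`, `TILE_ITEMS`, `even_share`, `can_vectorize`, `thread_aggregate`) and is meant to show the shape of the refactor, not a reviewed drop-in replacement:

// Caller guarantees at least one full tile, so remaining >= TILE_ITEMS here
OffsetT remaining = even_share.block_end - even_share.block_offset;

// First full tile
ConsumeTile<true>(thread_aggregate,
                  even_share.block_offset,
                  TILE_ITEMS,
                  Int2Type<true>(),
                  can_vectorize);

// Advance only while the next tile owned by this block is still within range;
// this keeps `block_offset += block_stride` from ever overflowing.
while (remaining >= even_share.block_stride)
{
  even_share.block_offset += even_share.block_stride;
  remaining -= even_share.block_stride;

  if (remaining < TILE_ITEMS)
  {
    break; // only a partially-full tile is left for this block
  }

  // Subsequent full tile
  ConsumeTile<false>(thread_aggregate,
                     even_share.block_offset,
                     TILE_ITEMS,
                     Int2Type<true>(),
                     can_vectorize);
}

// Consume the trailing partially-full tile, if any
if (remaining > 0 && remaining < TILE_ITEMS)
{
  ConsumeTile<false>(thread_aggregate,
                     even_share.block_offset,
                     static_cast<int>(remaining),
                     Int2Type<false>(),
                     can_vectorize);
}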

{
// First tile isn't full (not all threads have valid items)
int valid_items = even_share.block_end - even_share.block_offset;
@@ -368,35 +368,9 @@ struct AgentReduce
.Reduce(thread_aggregate, reduction_op, valid_items);
}

- // At least one full block
- ConsumeTile<true>(thread_aggregate,
- even_share.block_offset,
- TILE_ITEMS,
- Int2Type<true>(),
- can_vectorize);
- even_share.block_offset += even_share.block_stride;

- // Consume subsequent full tiles of input
- while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
- {
- ConsumeTile<false>(thread_aggregate,
- even_share.block_offset,
- TILE_ITEMS,
- Int2Type<true>(),
- can_vectorize);
- even_share.block_offset += even_share.block_stride;
- }

- // Consume a partially-full tile
- if (even_share.block_offset < even_share.block_end)
- {
- int valid_items = even_share.block_end - even_share.block_offset;
- ConsumeTile<false>(thread_aggregate,
- even_share.block_offset,
- valid_items,
- Int2Type<false>(),
- can_vectorize);
- }
+ // Extracting this into a function saves 8% of the generated kernel size by allowing the
+ // block reduction below to be reused. It also works around a hang in nvcc.
+ ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize);

// Compute block-wide reduction (all threads have valid items)
return BlockReduceT(temp_storage.reduce)
@@ -428,8 +402,7 @@ struct AgentReduce
__device__ __forceinline__ AccumT
ConsumeTiles(GridEvenShare<OffsetT> &even_share)
{
- // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread
- // block
+ // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>()))
@@ -438,6 +411,64 @@
: ConsumeRange(even_share,
Int2Type < false && ATTEMPT_VECTORIZATION > ());
}

+ private:
+ /**
+ * @brief Reduce a contiguous segment of input tiles with more than `TILE_ITEMS` elements
+ * @param even_share GridEvenShare descriptor
+ * @param can_vectorize Whether or not we can vectorize loads
+ */
+ template <int CAN_VECTORIZE>
+ __device__ __forceinline__ void
+ ConsumeFullTileRange(AccumT &thread_aggregate,
+ GridEvenShare<OffsetT> &even_share,
+ Int2Type<CAN_VECTORIZE> can_vectorize)
+ {
+ // At least one full block
+ ConsumeTile<true>(thread_aggregate,
+ even_share.block_offset,
+ TILE_ITEMS,
+ Int2Type<true>(),
+ can_vectorize);

+ if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+ {
+ // Exit early to handle offset overflow
+ return;
+ }

+ even_share.block_offset += even_share.block_stride;

+ // Consume subsequent full tiles of input; at least one full tile was processed, so
+ // `even_share.block_end >= TILE_ITEMS`
+ while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
+ {
+ ConsumeTile<false>(thread_aggregate,
+ even_share.block_offset,
+ TILE_ITEMS,
+ Int2Type<true>(),
+ can_vectorize);

+ if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+ {
+ // Exit early to handle offset overflow
+ return;
+ }

+ even_share.block_offset += even_share.block_stride;
+ }

+ // Consume a partially-full tile
+ if (even_share.block_offset < even_share.block_end)
+ {
+ int valid_items = even_share.block_end - even_share.block_offset;
+ ConsumeTile<false>(thread_aggregate,
+ even_share.block_offset,
+ valid_items,
+ Int2Type<false>(),
+ can_vectorize);
+ }
+ }
};

CUB_NAMESPACE_END
14 changes: 7 additions & 7 deletions test/test_device_reduce.cu
@@ -1333,10 +1333,10 @@ __global__ void InitializeTestAccumulatorTypes(int num_items,
}
}

- template <typename T>
- void TestBigIndicesHelper(int magnitude)
+ template <typename T,
+ typename OffsetT>
+ void TestBigIndicesHelper(OffsetT num_items)
{
- const std::size_t num_items = 1ll << magnitude;
thrust::constant_iterator<T> const_iter(T{1});
thrust::device_vector<std::size_t> out(1);
std::size_t* d_out = thrust::raw_pointer_cast(out.data());
@@ -1360,10 +1360,10 @@ void TestBigIndicesHelper(int magnitude)
template <typename T>
void TestBigIndices()
{
- TestBigIndicesHelper<T>(30);
- TestBigIndicesHelper<T>(31);
- TestBigIndicesHelper<T>(32);
- TestBigIndicesHelper<T>(33);
+ TestBigIndicesHelper<T, std::uint32_t>(1ull << 30);
+ TestBigIndicesHelper<T, std::uint32_t>(1ull << 31);
+ TestBigIndicesHelper<T, std::uint32_t>((1ull << 32) - 1);
+ TestBigIndicesHelper<T, std::uint64_t>(1ull << 33);
}

void TestAccumulatorTypes()