From 207b66bc5cf3d59de1d1f9807e30dd721d0e15f9 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 15 Nov 2022 23:25:50 +0400
Subject: [PATCH 1/2] Fix overflow in reduce

---
 cub/agent/agent_reduce.cuh | 22 ++++++++++++++++++++--
 test/test_device_reduce.cu | 14 +++++++-------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/cub/agent/agent_reduce.cuh b/cub/agent/agent_reduce.cuh
index aa18efa07e..53251aff2a 100644
--- a/cub/agent/agent_reduce.cuh
+++ b/cub/agent/agent_reduce.cuh
@@ -355,7 +355,7 @@ struct AgentReduce
   {
     AccumT thread_aggregate{};

-    if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+    if (even_share.block_end - even_share.block_offset < TILE_ITEMS)
     {
       // First tile isn't full (not all threads have valid items)
       int valid_items = even_share.block_end - even_share.block_offset;
@@ -374,16 +374,34 @@ struct AgentReduce
                       TILE_ITEMS,
                       Int2Type<true>(),
                       can_vectorize);
+
+    // Exit early to handle offset overflow
+    if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+    {
+      // Compute block-wide reduction (all threads have valid items)
+      return BlockReduceT(temp_storage.reduce)
+        .Reduce(thread_aggregate, reduction_op);
+    }
+
     even_share.block_offset += even_share.block_stride;

     // Consume subsequent full tiles of input
-    while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
+    while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
     {
       ConsumeTile<false>(thread_aggregate,
                          even_share.block_offset,
                          TILE_ITEMS,
                          Int2Type<true>(),
                          can_vectorize);
+
+      // Exit early to handle offset overflow
+      if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+      {
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduceT(temp_storage.reduce)
+          .Reduce(thread_aggregate, reduction_op);
+      }
+
       even_share.block_offset += even_share.block_stride;
     }

diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu
index b3df906d12..113dbd2e77 100644
--- a/test/test_device_reduce.cu
+++ b/test/test_device_reduce.cu
@@ -1333,10 +1333,10 @@ __global__ void InitializeTestAccumulatorTypes(int num_items,
   }
 }

-template <typename T>
-void TestBigIndicesHelper(int magnitude)
+template <typename T, typename OffsetT>
+void TestBigIndicesHelper(OffsetT num_items)
 {
-  const std::size_t num_items = 1ll << magnitude;
   thrust::constant_iterator<T> const_iter(T{1});
   thrust::device_vector<std::size_t> out(1);
   std::size_t* d_out = thrust::raw_pointer_cast(out.data());
@@ -1360,10 +1360,10 @@ void TestBigIndicesHelper(int magnitude)
 template <typename T>
 void TestBigIndices()
 {
-  TestBigIndicesHelper<T>(30);
-  TestBigIndicesHelper<T>(31);
-  TestBigIndicesHelper<T>(32);
-  TestBigIndicesHelper<T>(33);
+  TestBigIndicesHelper<T, std::uint32_t>(1ull << 30);
+  TestBigIndicesHelper<T, std::uint32_t>(1ull << 31);
+  TestBigIndicesHelper<T, std::uint32_t>((1ull << 32) - 1);
+  TestBigIndicesHelper<T, std::uint64_t>(1ull << 33);
 }

 void TestAccumulatorTypes()

From a56aed9d0beec0d457772b451efae4fd35386a2a Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Thu, 24 Nov 2022 12:49:02 +0400
Subject: [PATCH 2/2] WAR hang in reduction with complex operator

---
 cub/agent/agent_reduce.cuh | 111 +++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 49 deletions(-)

diff --git a/cub/agent/agent_reduce.cuh b/cub/agent/agent_reduce.cuh
index 53251aff2a..4a29d707b3 100644
--- a/cub/agent/agent_reduce.cuh
+++ b/cub/agent/agent_reduce.cuh
@@ -368,53 +368,9 @@ struct AgentReduce
         .Reduce(thread_aggregate, reduction_op, valid_items);
     }

-    // At least one full block
-    ConsumeTile<true>(thread_aggregate,
-                      even_share.block_offset,
-                      TILE_ITEMS,
-                      Int2Type<true>(),
-                      can_vectorize);
-
-    // Exit early to handle offset overflow
-    if (even_share.block_end - even_share.block_offset < even_share.block_stride)
-    {
-      // Compute block-wide reduction (all threads have valid items)
-      return BlockReduceT(temp_storage.reduce)
-        .Reduce(thread_aggregate, reduction_op);
-    }
-
-    even_share.block_offset += even_share.block_stride;
-
-    // Consume subsequent full tiles of input
-    while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
-    {
-      ConsumeTile<false>(thread_aggregate,
-                         even_share.block_offset,
-                         TILE_ITEMS,
-                         Int2Type<true>(),
-                         can_vectorize);
-
-      // Exit early to handle offset overflow
-      if (even_share.block_end - even_share.block_offset < even_share.block_stride)
-      {
-        // Compute block-wide reduction (all threads have valid items)
-        return BlockReduceT(temp_storage.reduce)
-          .Reduce(thread_aggregate, reduction_op);
-      }
-
-      even_share.block_offset += even_share.block_stride;
-    }
-
-    // Consume a partially-full tile
-    if (even_share.block_offset < even_share.block_end)
-    {
-      int valid_items = even_share.block_end - even_share.block_offset;
-      ConsumeTile<false>(thread_aggregate,
-                         even_share.block_offset,
-                         valid_items,
-                         Int2Type<false>(),
-                         can_vectorize);
-    }
+    // Extracting this into a function saves 8% of generated kernel size by allowing the
+    // block reduction below to be reused. It also works around a hang in nvcc.
+    ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize);

     // Compute block-wide reduction (all threads have valid items)
     return BlockReduceT(temp_storage.reduce)
@@ -446,8 +402,7 @@ struct AgentReduce
   __device__ __forceinline__ AccumT
   ConsumeTiles(GridEvenShare<OffsetT> &even_share)
   {
-    // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread
-    // block
+    // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
     even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

     return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>()))
@@ -456,6 +411,64 @@ struct AgentReduce
              : ConsumeRange(even_share,
                             Int2Type<false && ATTEMPT_VECTORIZATION>());
   }
+
+private:
+  /**
+   * @brief Reduce a contiguous segment of input tiles with more than `TILE_ITEMS` elements
+   * @param even_share GridEvenShare descriptor
+   * @param can_vectorize Whether or not we can vectorize loads
+   */
+  template <int CAN_VECTORIZE>
+  __device__ __forceinline__ void
+  ConsumeFullTileRange(AccumT &thread_aggregate,
+                       GridEvenShare<OffsetT> &even_share,
+                       Int2Type<CAN_VECTORIZE> can_vectorize)
+  {
+    // At least one full block
+    ConsumeTile<true>(thread_aggregate,
+                      even_share.block_offset,
+                      TILE_ITEMS,
+                      Int2Type<true>(),
+                      can_vectorize);
+
+    if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+    {
+      // Exit early to handle offset overflow
+      return;
+    }
+
+    even_share.block_offset += even_share.block_stride;
+
+    // Consume subsequent full tiles of input; at least one full tile was processed, so
+    // `even_share.block_end >= TILE_ITEMS`
+    while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
+    {
+      ConsumeTile<false>(thread_aggregate,
+                         even_share.block_offset,
+                         TILE_ITEMS,
+                         Int2Type<true>(),
+                         can_vectorize);
+
+      if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+      {
+        // Exit early to handle offset overflow
+        return;
+      }
+
+      even_share.block_offset += even_share.block_stride;
+    }
+
+    // Consume a partially-full tile
+    if (even_share.block_offset < even_share.block_end)
+    {
+      int valid_items = even_share.block_end - even_share.block_offset;
+      ConsumeTile<false>(thread_aggregate,
+                         even_share.block_offset,
+                         valid_items,
+                         Int2Type<false>(),
+                         can_vectorize);
+    }
+  }
 };

 CUB_NAMESPACE_END
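
A minimal host-side sketch (not part of the patch series) of the overflow the first commit guards against. The identifiers mirror the patch (`block_offset`, `block_end`, `TILE_ITEMS`); the constants are hypothetical, chosen to sit at the top of the 32-bit offset range:

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    using OffsetT = std::uint32_t;
    const OffsetT TILE_ITEMS   = 512;         // illustrative tile size
    const OffsetT block_offset = 4294966784u; // 2^32 - 512
    const OffsetT block_end    = 4294967295u; // 2^32 - 1: only 511 items remain

    // Old guard: block_offset + TILE_ITEMS wraps to 0, so "0 > block_end" is
    // false and the partial tile is mistaken for a full one.
    const bool full_old = !(block_offset + TILE_ITEMS > block_end);

    // New guard: compare the remaining item count instead; block_offset never
    // exceeds block_end, so the subtraction cannot wrap.
    const bool full_new = !(block_end - block_offset < TILE_ITEMS);

    std::printf("old guard sees a full tile: %d\n", full_old); // prints 1 (wrong)
    std::printf("new guard sees a full tile: %d\n", full_new); // prints 0 (correct)
  }

The loop condition is rewritten for the same reason (`block_offset <= block_end - TILE_ITEMS` instead of `block_offset + TILE_ITEMS <= block_end`), and the early-exit check `block_end - block_offset < block_stride` returns before `block_offset += block_stride` can wrap past the top of the OffsetT range.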