From 207b66bc5cf3d59de1d1f9807e30dd721d0e15f9 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 15 Nov 2022 23:25:50 +0400
Subject: [PATCH 1/2] Fix overflow in reduce

---
 cub/agent/agent_reduce.cuh | 22 ++++++++++++++++++++--
 test/test_device_reduce.cu | 14 +++++++-------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/cub/agent/agent_reduce.cuh b/cub/agent/agent_reduce.cuh
index aa18efa07e..53251aff2a 100644
--- a/cub/agent/agent_reduce.cuh
+++ b/cub/agent/agent_reduce.cuh
@@ -355,7 +355,7 @@ struct AgentReduce
   {
     AccumT thread_aggregate{};

-    if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+    if (even_share.block_end - even_share.block_offset < TILE_ITEMS)
     {
       // First tile isn't full (not all threads have valid items)
       int valid_items = even_share.block_end - even_share.block_offset;
@@ -374,16 +374,34 @@ struct AgentReduce
                       TILE_ITEMS,
                       Int2Type<true>(),
                       can_vectorize);
+
+    // Exit early to handle offset overflow
+    if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+    {
+      // Compute block-wide reduction (all threads have valid items)
+      return BlockReduceT(temp_storage.reduce)
+        .Reduce(thread_aggregate, reduction_op);
+    }
+
     even_share.block_offset += even_share.block_stride;

     // Consume subsequent full tiles of input
-    while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
+    while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
     {
       ConsumeTile<false>(thread_aggregate,
                          even_share.block_offset,
                          TILE_ITEMS,
                          Int2Type<true>(),
                          can_vectorize);
+
+      // Exit early to handle offset overflow
+      if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+      {
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduceT(temp_storage.reduce)
+          .Reduce(thread_aggregate, reduction_op);
+      }
+
       even_share.block_offset += even_share.block_stride;
     }

diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu
index b3df906d12..113dbd2e77 100644
--- a/test/test_device_reduce.cu
+++ b/test/test_device_reduce.cu
@@ -1333,10 +1333,10 @@ __global__ void InitializeTestAccumulatorTypes(int num_items,
   }
 }

-template <typename T>
-void TestBigIndicesHelper(int magnitude)
+template <typename T, typename OffsetT>
+void TestBigIndicesHelper(OffsetT num_items)
 {
-  const std::size_t num_items = 1ll << magnitude;
   thrust::constant_iterator<T> const_iter(T{1});
   thrust::device_vector<std::size_t> out(1);
   std::size_t* d_out = thrust::raw_pointer_cast(out.data());
@@ -1360,10 +1360,10 @@ void TestBigIndicesHelper(int magnitude)
 template <typename T>
 void TestBigIndices()
 {
-  TestBigIndicesHelper<T>(30);
-  TestBigIndicesHelper<T>(31);
-  TestBigIndicesHelper<T>(32);
-  TestBigIndicesHelper<T>(33);
+  TestBigIndicesHelper<T, std::uint32_t>(1ull << 30);
+  TestBigIndicesHelper<T, std::uint32_t>(1ull << 31);
+  TestBigIndicesHelper<T, std::uint32_t>((1ull << 32) - 1);
+  TestBigIndicesHelper<T, std::uint64_t>(1ull << 33);
 }

 void TestAccumulatorTypes()

From a56aed9d0beec0d457772b451efae4fd35386a2a Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Thu, 24 Nov 2022 12:49:02 +0400
Subject: [PATCH 2/2] WAR hang in reduction with complex operator

---
 cub/agent/agent_reduce.cuh | 111 +++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 49 deletions(-)

diff --git a/cub/agent/agent_reduce.cuh b/cub/agent/agent_reduce.cuh
index 53251aff2a..4a29d707b3 100644
--- a/cub/agent/agent_reduce.cuh
+++ b/cub/agent/agent_reduce.cuh
@@ -368,53 +368,9 @@ struct AgentReduce
         .Reduce(thread_aggregate, reduction_op, valid_items);
     }

-    // At least one full block
-    ConsumeTile<true>(thread_aggregate,
-                      even_share.block_offset,
-                      TILE_ITEMS,
-                      Int2Type<true>(),
-                      can_vectorize);
-
-    // Exit early to handle offset overflow
-    if (even_share.block_end - even_share.block_offset < even_share.block_stride)
-    {
-      // Compute block-wide reduction (all threads have valid items)
-      return BlockReduceT(temp_storage.reduce)
-        .Reduce(thread_aggregate, reduction_op);
-    }
-
-    even_share.block_offset += even_share.block_stride;
-
-    // Consume subsequent full tiles of input
-    while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
-    {
-      ConsumeTile<false>(thread_aggregate,
-                         even_share.block_offset,
-                         TILE_ITEMS,
-                         Int2Type<true>(),
-                         can_vectorize);
-
-      // Exit early to handle offset overflow
-      if (even_share.block_end - even_share.block_offset < even_share.block_stride)
-      {
-        // Compute block-wide reduction (all threads have valid items)
-        return BlockReduceT(temp_storage.reduce)
-          .Reduce(thread_aggregate, reduction_op);
-      }
-
-      even_share.block_offset += even_share.block_stride;
-    }
-
-    // Consume a partially-full tile
-    if (even_share.block_offset < even_share.block_end)
-    {
-      int valid_items = even_share.block_end - even_share.block_offset;
-      ConsumeTile<false>(thread_aggregate,
-                         even_share.block_offset,
-                         valid_items,
-                         Int2Type<false>(),
-                         can_vectorize);
-    }
+    // Extracting this into a function saves 8% of generated kernel size by allowing the
+    // block reduction below to be reused. It also works around a hang in nvcc.
+    ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize);

     // Compute block-wide reduction (all threads have valid items)
     return BlockReduceT(temp_storage.reduce)
@@ -446,8 +402,7 @@ struct AgentReduce
   __device__ __forceinline__ AccumT
   ConsumeTiles(GridEvenShare<OffsetT> &even_share)
   {
-    // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread
-    // block
+    // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
     even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

     return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>()))
@@ -456,6 +411,64 @@ struct AgentReduce
              : ConsumeRange(even_share,
                             Int2Type<false && ATTEMPT_VECTORIZATION>());
   }
+
+private:
+  /**
+   * @brief Reduce a contiguous segment of input tiles with more than `TILE_ITEMS` elements
+   * @param even_share GridEvenShare descriptor
+   * @param can_vectorize Whether or not we can vectorize loads
+   */
+  template <int CAN_VECTORIZE>
+  __device__ __forceinline__ void
+  ConsumeFullTileRange(AccumT &thread_aggregate,
+                       GridEvenShare<OffsetT> &even_share,
+                       Int2Type<CAN_VECTORIZE> can_vectorize)
+  {
+    // At least one full block
+    ConsumeTile<true>(thread_aggregate,
+                      even_share.block_offset,
+                      TILE_ITEMS,
+                      Int2Type<true>(),
+                      can_vectorize);
+
+    if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+    {
+      // Exit early to handle offset overflow
+      return;
+    }
+
+    even_share.block_offset += even_share.block_stride;
+
+    // Consume subsequent full tiles of input; at least one full tile was processed, so
+    // `even_share.block_end >= TILE_ITEMS`
+    while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
+    {
+      ConsumeTile<false>(thread_aggregate,
+                         even_share.block_offset,
+                         TILE_ITEMS,
+                         Int2Type<true>(),
+                         can_vectorize);
+
+      if (even_share.block_end - even_share.block_offset < even_share.block_stride)
+      {
+        // Exit early to handle offset overflow
+        return;
+      }
+
+      even_share.block_offset += even_share.block_stride;
+    }
+
+    // Consume a partially-full tile
+    if (even_share.block_offset < even_share.block_end)
+    {
+      int valid_items = even_share.block_end - even_share.block_offset;
+      ConsumeTile<false>(thread_aggregate,
+                         even_share.block_offset,
+                         valid_items,
+                         Int2Type<false>(),
+                         can_vectorize);
+    }
+  }
 };

 CUB_NAMESPACE_END
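
A minimal host-side sketch (not part of the patch series) of the overflow the first commit guards against. The identifiers mirror the patch (`block_offset`, `block_end`, `TILE_ITEMS`); the constants are hypothetical, chosen to sit at the top of the 32-bit offset range:

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    using OffsetT = std::uint32_t;
    const OffsetT TILE_ITEMS   = 512;         // illustrative tile size
    const OffsetT block_offset = 4294966784u; // 2^32 - 512
    const OffsetT block_end    = 4294967295u; // 2^32 - 1: only 511 items remain

    // Old guard: block_offset + TILE_ITEMS wraps to 0, so "0 > block_end" is
    // false and the partial tile is mistaken for a full one.
    const bool full_old = !(block_offset + TILE_ITEMS > block_end);

    // New guard: compare the remaining item count instead; block_offset never
    // exceeds block_end, so the subtraction cannot wrap.
    const bool full_new = !(block_end - block_offset < TILE_ITEMS);

    std::printf("old guard sees a full tile: %d\n", full_old); // prints 1 (wrong)
    std::printf("new guard sees a full tile: %d\n", full_new); // prints 0 (correct)
  }

The loop condition is rewritten for the same reason (`block_offset <= block_end - TILE_ITEMS` instead of `block_offset + TILE_ITEMS <= block_end`), and the early-exit check `block_end - block_offset < block_stride` returns before `block_offset += block_stride` can wrap past the top of the OffsetT range.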