From 9b50753fba2f4a5d31954345b315f8b330591aac Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 2 Aug 2022 01:27:59 +0200 Subject: [PATCH 1/5] Fix begin_bit == end_bit == 0 for device-wide and segmented sort. - Copy if begin_bit == end_bit, but overwrite not allowed - Fix style - When begin_bit == end_bit and double-buffering, don't do any sorting work - Uncommented segmented sort test - begin_bit == end_bit == 0 for upsweep/downsweep and segmented sort - Fixed begin_bit == end_bit == 0 case --- cub/device/dispatch/dispatch_radix_sort.cuh | 88 +++++++++++++++------ test/test_device_radix_sort.cu | 8 +- 2 files changed, 71 insertions(+), 25 deletions(-) diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index c070fdd3d1..af8687b224 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -912,12 +912,12 @@ struct DeviceRadixSortPolicy struct Policy800 : ChainedPolicy<800, Policy800, Policy700> { enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, - SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, - SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, - ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), - ONESWEEP_RADIX_BITS = 8, - OFFSET_64BIT = sizeof(OffsetT) == 8, + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), + ONESWEEP_RADIX_BITS = 8, + OFFSET_64BIT = sizeof(OffsetT) == 8, }; // Histogram policy @@ -1366,7 +1366,7 @@ struct DispatchRadixSort : ValueT* d_values_tmp2 = (ValueT*)allocations[3]; AtomicOffsetT* d_ctrs = (AtomicOffsetT*)allocations[4]; - do { + do { // initialization if (CubDebug(error = cudaMemsetAsync( d_ctrs, 0, num_portions * num_passes * sizeof(AtomicOffsetT), stream))) break; @@ -1498,6 +1498,8 @@ struct DispatchRadixSort : } } + if (CubDebug(error)) break; + // use the temporary buffers if no overwrite is allowed if (!is_overwrite_okay && pass == 0) { @@ -1671,6 +1673,42 @@ struct DispatchRadixSort : return InvokeOnesweep(); } + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeCopy() + { + // is_overwrite_okay == false here + // Return the number of temporary bytes if requested + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Copy keys + cudaError_t error = cudaSuccess; + error = cudaMemcpyAsync(d_keys.Alternate(), d_keys.Current(), num_items * sizeof(KeyT), + cudaMemcpyDefault, stream); + if (CubDebug(error)) + { + return error; + } + d_keys.selector ^= 1; + + // Copy values if necessary + if (!KEYS_ONLY) + { + error = cudaMemcpyAsync(d_values.Alternate(), d_values.Current(), + num_items * sizeof(ValueT), cudaMemcpyDefault, stream); + if (CubDebug(error)) + { + return error; + } + } + d_values.selector ^= 1; + + return error; + } + /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ @@ -1679,15 +1717,20 @@ struct DispatchRadixSort : typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; - // Return if empty problem - if (num_items == 0) + // Return if empty problem, or if no bits to sort and double-buffering is used + if (num_items == 0 || (begin_bit == end_bit && is_overwrite_okay)) { - if (d_temp_storage == nullptr) - { - temp_storage_bytes = 1; - } + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + } + return cudaSuccess; + } - return cudaSuccess; + // Check if simple copy suffices (is_overwrite_okay == false at this point) + if (begin_bit == end_bit) + { + return InvokeCopy(); } // Force kernel code-generation in all compiler passes @@ -2021,7 +2064,7 @@ struct DispatchSegmentedRadixSort : int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; int num_bits = end_bit - begin_bit; - int num_passes = (num_bits + radix_bits - 1) / radix_bits; + int num_passes = CUB_MAX(DivideAndRoundUp(num_bits, radix_bits), 1); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * radix_bits) - num_bits; int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); @@ -2082,15 +2125,14 @@ struct DispatchSegmentedRadixSort : { typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; - // Return if empty problem - if (num_items == 0) + // Return if empty problem, or if no bits to sort and double-buffering is used + if (num_items == 0 || (begin_bit == end_bit && is_overwrite_okay)) { - if (d_temp_storage == nullptr) - { - temp_storage_bytes = 1; - } - - return cudaSuccess; + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + } + return cudaSuccess; } // Force kernel code-generation in all compiler passes diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu index e584977483..0c40fceeae 100644 --- a/test/test_device_radix_sort.cu +++ b/test/test_device_radix_sort.cu @@ -1418,6 +1418,11 @@ void TestBits( printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout); TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); + // Equal bits + begin_bit = end_bit = 0; + printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout); + TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit); + // Across subword boundaries int mid_bit = sizeof(KeyT) * 4; printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1); fflush(stdout); @@ -1587,7 +1592,7 @@ void TestGen( { if (max_items == ~std::size_t(0)) { - max_items = 9000003; + max_items = 8000003; } if (max_segments < 0) @@ -1650,7 +1655,6 @@ void TestGen( TestSizes(h_keys.get(), large_num_items, max_segments, true); fflush(stdout); } - } From d8ca8c70e7fe153141d320bdf64d2c433fb1c63c Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 2 Aug 2022 19:40:56 +0200 Subject: [PATCH 2/5] Addressed review comments. --- cub/device/dispatch/dispatch_radix_sort.cuh | 25 ++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index af8687b224..6a617768d5 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1498,7 +1498,10 @@ struct DispatchRadixSort : } } - if (CubDebug(error)) break; + if (error != cudaSuccess) + { + break; + } // use the temporary buffers if no overwrite is allowed if (!is_overwrite_okay && pass == 0) @@ -1685,24 +1688,40 @@ struct DispatchRadixSort : } // Copy keys + #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG + _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long)num_items, + (long long)stream); + #endif cudaError_t error = cudaSuccess; error = cudaMemcpyAsync(d_keys.Alternate(), d_keys.Current(), num_items * sizeof(KeyT), - cudaMemcpyDefault, stream); + cudaMemcpyDeviceToDevice, stream); if (CubDebug(error)) { return error; } + if (CubDebug(error = detail::DebugSyncStream(stream))) + { + return error; + } d_keys.selector ^= 1; // Copy values if necessary if (!KEYS_ONLY) { + #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG + _CubLog("Invoking async copy of %lld values on stream %lld\n", + (long long)num_items, (long long)stream); + #endif error = cudaMemcpyAsync(d_values.Alternate(), d_values.Current(), - num_items * sizeof(ValueT), cudaMemcpyDefault, stream); + num_items * sizeof(ValueT), cudaMemcpyDeviceToDevice, stream); if (CubDebug(error)) { return error; } + if (CubDebug(error = detail::DebugSyncStream(stream))) + { + return error; + } } d_values.selector ^= 1; From e9bcf7a1ae25aa52bc21ddfaf2fa85ce7e1f5906 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Wed, 3 Aug 2022 21:55:25 +0200 Subject: [PATCH 3/5] Check for unified addressing before using cudaMemcpyDefault. --- cub/device/dispatch/dispatch_radix_sort.cuh | 9 ++++++--- cub/util_device.cuh | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 6a617768d5..0806b8c0d3 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1694,7 +1694,7 @@ struct DispatchRadixSort : #endif cudaError_t error = cudaSuccess; error = cudaMemcpyAsync(d_keys.Alternate(), d_keys.Current(), num_items * sizeof(KeyT), - cudaMemcpyDeviceToDevice, stream); + cudaMemcpyDefault, stream); if (CubDebug(error)) { return error; @@ -1713,7 +1713,7 @@ struct DispatchRadixSort : (long long)num_items, (long long)stream); #endif error = cudaMemcpyAsync(d_values.Alternate(), d_values.Current(), - num_items * sizeof(ValueT), cudaMemcpyDeviceToDevice, stream); + num_items * sizeof(ValueT), cudaMemcpyDefault, stream); if (CubDebug(error)) { return error; @@ -1747,7 +1747,10 @@ struct DispatchRadixSort : } // Check if simple copy suffices (is_overwrite_okay == false at this point) - if (begin_bit == end_bit) + cudaError_t error = cudaSuccess; + bool has_uva = false; + if ((error = HasUVA(has_uva)) != cudaSuccess) return error; + if (begin_bit == end_bit & has_uva) { return InvokeCopy(); } diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 0965f3d654..2820311210 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -125,6 +125,23 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice() return device; } +/** \brief Gets whether the current device supports unified addressing */ +CUB_RUNTIME_FUNCTION cudaError_t HasUVA(bool& has_uva) +{ + has_uva = false; + cudaError_t error = cudaSuccess; + int device = -1; + if (CubDebug(error = cudaGetDevice(&device)) != cudaSuccess) return error; + int uva = 0; + if (CubDebug(error = cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device)) + != cudaSuccess) + { + return error; + } + has_uva = uva == 1; + return error; +} + /** * \brief RAII helper which saves the current device and switches to the * specified device on construction and switches to the saved device on From f094620597bf76a5ab50bbae59b7a3bf76035c2d Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 4 Aug 2022 21:48:28 +0200 Subject: [PATCH 4/5] Addressed review comments. --- cub/device/dispatch/dispatch_radix_sort.cuh | 13 +++++--- cub/util_device.cuh | 34 ++++++++++----------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 0806b8c0d3..0165bbc693 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1747,12 +1747,15 @@ struct DispatchRadixSort : } // Check if simple copy suffices (is_overwrite_okay == false at this point) - cudaError_t error = cudaSuccess; - bool has_uva = false; - if ((error = HasUVA(has_uva)) != cudaSuccess) return error; - if (begin_bit == end_bit & has_uva) + if (begin_bit == end_bit) { - return InvokeCopy(); + bool has_uva = false; + cudaError_t error = detail::HasUVA(has_uva); + if (error != cudaSuccess) return error; + if (has_uva) + { + return InvokeCopy(); + } } // Force kernel code-generation in all compiler passes diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 2820311210..4b2065d9f3 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -125,23 +125,6 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice() return device; } -/** \brief Gets whether the current device supports unified addressing */ -CUB_RUNTIME_FUNCTION cudaError_t HasUVA(bool& has_uva) -{ - has_uva = false; - cudaError_t error = cudaSuccess; - int device = -1; - if (CubDebug(error = cudaGetDevice(&device)) != cudaSuccess) return error; - int uva = 0; - if (CubDebug(error = cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device)) - != cudaSuccess) - { - return error; - } - has_uva = uva == 1; - return error; -} - /** * \brief RAII helper which saves the current device and switches to the * specified device on construction and switches to the saved device on @@ -566,6 +549,23 @@ CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) #endif } +/** \brief Gets whether the current device supports unified addressing */ +CUB_RUNTIME_FUNCTION cudaError_t HasUVA(bool& has_uva) +{ + has_uva = false; + cudaError_t error = cudaSuccess; + int device = -1; + if (CubDebug(error = cudaGetDevice(&device)) != cudaSuccess) return error; + int uva = 0; + if (CubDebug(error = cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device)) + != cudaSuccess) + { + return error; + } + has_uva = uva == 1; + return error; +} + } // namespace detail /** From 5daac39e01718a87d77d44798ae79626d5672cc7 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 9 Aug 2022 12:24:26 +0400 Subject: [PATCH 5/5] Add inline to HasUVA --- cub/util_device.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 4b2065d9f3..ef9f120442 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -550,7 +550,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) } /** \brief Gets whether the current device supports unified addressing */ -CUB_RUNTIME_FUNCTION cudaError_t HasUVA(bool& has_uva) +CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) { has_uva = false; cudaError_t error = cudaSuccess;