diff --git a/.clangd b/.clangd
index c3edf49bcc..e60a0cb1a7 100644
--- a/.clangd
+++ b/.clangd
@@ -32,6 +32,7 @@ CompileFlags:
     # report all errors
     - "-ferror-limit=0"
     - "-ftemplate-backtrace-limit=0"
+    - "-stdlib=libc++"
   Remove:
     - -stdpar
     # strip CUDA fatbin args
diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh
index 63fbd7c85e..e5d1566213 100644
--- a/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/agent/single_pass_scan_operators.cuh
@@ -36,12 +36,16 @@
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
+
 CUB_NAMESPACE_BEGIN
@@ -106,6 +110,44 @@ enum ScanTileStatus
     SCAN_TILE_INCLUSIVE,        // Inclusive tile prefix is available
 };
 
+namespace detail
+{
+
+template
+__device__ __forceinline__ void delay()
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (if (Delay > 0)
+                {
+                  if (gridDim.x < GridThreshold)
+                  {
+                    __threadfence_block();
+                  }
+                  else
+                  {
+                    __nanosleep(Delay);
+                  }
+                }));
+}
+
+template
+__device__ __forceinline__ void delay_or_prevent_hoisting()
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (delay();),
+               (__threadfence_block();));
+}
+
+template
+__device__ __forceinline__ void delay_on_dc_gpu_or_prevent_hoisting()
+{
+  NV_DISPATCH_TARGET(
+    NV_IS_EXACTLY_SM_80, (delay();),
+    NV_PROVIDES_SM_70, (delay< 0, GridThreshold>();),
+    NV_IS_DEVICE, (__threadfence_block();));
+}
+
+}
 
 /**
  * Tile status interface.
@@ -127,20 +169,20 @@ struct ScanTileState
     // Status word type
     using StatusWord = cub::detail::conditional_t<
       sizeof(T) == 8,
-      long long,
+      unsigned long long,
       cub::detail::conditional_t<
         sizeof(T) == 4,
-        int,
-        cub::detail::conditional_t>>;
+        unsigned int,
+        cub::detail::conditional_t>>;
 
     // Unit word type
     using TxnWord = cub::detail::conditional_t<
       sizeof(T) == 8,
-      longlong2,
+      ulonglong2,
       cub::detail::conditional_t<
         sizeof(T) == 4,
-        int2,
-        cub::detail::conditional_t>>;
+        uint2,
+        unsigned int>>;
 
     // Device word type
     struct TileDescriptor
@@ -230,7 +272,8 @@ struct ScanTileState
 
         TxnWord alias;
         *reinterpret_cast(&alias) = tile_descriptor;
-        ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+
+        detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
 
@@ -245,7 +288,8 @@ struct ScanTileState
 
         TxnWord alias;
         *reinterpret_cast(&alias) = tile_descriptor;
-        ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+
+        detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
     /**
@@ -257,13 +301,18 @@ struct ScanTileState
         T &value)
     {
         TileDescriptor tile_descriptor;
-        do
+
         {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast(alias);
+            TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast(alias);
+        }
 
-        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
+        while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff))
+        {
+            detail::delay_or_prevent_hoisting();
+            TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast(alias);
+        }
 
         status = tile_descriptor.status;
         value = tile_descriptor.value;
@@ -281,7 +330,7 @@ template
 struct ScanTileState
 {
     // Status word type
-    typedef char StatusWord;
+    using StatusWord = unsigned int;
 
     // Constants
     enum
@@ -382,12 +431,7 @@ struct ScanTileState
     {
         // Update tile inclusive value
         ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
+        detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
     }
 
 
@@ -398,12 +442,7 @@ struct ScanTileState
     {
         // Update tile partial value
         ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
+        detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
     }
 
     /**
@@ -414,17 +453,21 @@ struct ScanTileState
         StatusWord &status,
         T &value)
     {
-        do {
-            status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-
-            __threadfence(); // prevent hoisting loads from loop or loads below above this one
+        do
+        {
+            status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+            __threadfence();
 
-        } while (status == SCAN_TILE_INVALID);
+        } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff));
 
         if (status == StatusWord(SCAN_TILE_PARTIAL))
-            value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        {
+            value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        }
         else
-            value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+        {
+            value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+        }
     }
 };
@@ -471,7 +514,7 @@ template <
     typename KeyT>
 struct ReduceByKeyScanTileState
 {
-    typedef KeyValuePair KeyValuePairT;
+    using KeyValuePairT = KeyValuePair;
 
     // Constants
     enum
@@ -486,17 +529,17 @@ struct ReduceByKeyScanTileState
     // Status word type
     using StatusWord = cub::detail::conditional_t<
       STATUS_WORD_SIZE == 8,
-      long long,
+      unsigned long long,
       cub::detail::conditional_t<
         STATUS_WORD_SIZE == 4,
-        int,
-        cub::detail::conditional_t>>;
+        unsigned int,
+        cub::detail::conditional_t>>;
 
     // Status word type
     using TxnWord = cub::detail::conditional_t<
       TXN_WORD_SIZE == 16,
-      longlong2,
-      cub::detail::conditional_t>;
+      ulonglong2,
+      cub::detail::conditional_t>;
 
     // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
     struct TileDescriptorBigStatus
@@ -594,7 +637,8 @@ struct ReduceByKeyScanTileState
 
         TxnWord alias;
         *reinterpret_cast(&alias) = tile_descriptor;
-        ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+
+        detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
 
@@ -610,7 +654,8 @@ struct ReduceByKeyScanTileState
 
         TxnWord alias;
         *reinterpret_cast(&alias) = tile_descriptor;
-        ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+
+        detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
     /**
@@ -637,11 +682,12 @@ struct ReduceByKeyScanTileState
         //     value.key = tile_descriptor.key;
 
         TileDescriptor tile_descriptor;
+
         do
         {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast(alias);
+            detail::delay_on_dc_gpu_or_prevent_hoisting();
+            TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast(alias);
 
         } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
 
@@ -750,6 +796,7 @@ struct TilePrefixCallbackOp
         T window_aggregate;
 
         // Wait for the warp-wide window of predecessor tiles to become valid
+        detail::delay<450>();
         ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
 
         // The exclusive tile prefix starts out as the current window aggregate
diff --git a/cub/detail/strong_load.cuh b/cub/detail/strong_load.cuh
new file mode 100644
index 0000000000..12e6672b9e
--- /dev/null
+++ b/cub/detail/strong_load.cuh
@@ -0,0 +1,182 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * @file Utilities for strong memory operations.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+CUB_NAMESPACE_BEGIN
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+namespace detail
+{
+
+static __device__ __forceinline__ uint4 load_relaxed(uint4 const *ptr)
+{
+  uint4 retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];"
+                             : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];"
+                             : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ ulonglong2 load_relaxed(ulonglong2 const *ptr)
+{
+  ulonglong2 retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];"
+                             : "=l"(retval.x), "=l"(retval.y)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];"
+                             : "=l"(retval.x), "=l"(retval.y)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ ushort4 load_relaxed(ushort4 const *ptr)
+{
+  ushort4 retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];"
+                             : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];"
+                             : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ uint2 load_relaxed(uint2 const *ptr)
+{
+  uint2 retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];"
+                             : "=r"(retval.x), "=r"(retval.y)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];"
+                             : "=r"(retval.x), "=r"(retval.y)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ unsigned long long load_relaxed(unsigned long long const *ptr)
+{
+  unsigned long long retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.u64 %0, [%1];"
+                             : "=l"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.u64 %0, [%1];"
+                             : "=l"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ unsigned int load_relaxed(unsigned int const *ptr)
+{
+  unsigned int retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.u32 %0, [%1];"
+                             : "=r"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.u32 %0, [%1];"
+                             : "=r"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+
+  return retval;
+}
+
+static __device__ __forceinline__ unsigned short load_relaxed(unsigned short const *ptr)
+{
+  unsigned short retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.u16 %0, [%1];"
+                             : "=h"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.u16 %0, [%1];"
+                             : "=h"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ unsigned char load_relaxed(unsigned char const *ptr)
+{
+  unsigned short retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("{"
+                             "  .reg .u8 datum;"
+                             "  ld.relaxed.gpu.u8 datum, [%1];"
+                             "  cvt.u16.u8 %0, datum;"
+                             "}"
+                             : "=h"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("{"
+                             "  .reg .u8 datum;"
+                             "  ld.cg.u8 datum, [%1];"
+                             "  cvt.u16.u8 %0, datum;"
+                             "}"
+                             : "=h"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
"memory");)); + return (unsigned char)retval; +} + +} // namespace detail + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +CUB_NAMESPACE_END + diff --git a/cub/detail/strong_store.cuh b/cub/detail/strong_store.cuh new file mode 100644 index 0000000000..fd293519a3 --- /dev/null +++ b/cub/detail/strong_store.cuh @@ -0,0 +1,279 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * @file Utilities for strong memory operations. 
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+CUB_NAMESPACE_BEGIN
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+namespace detail
+{
+
+static __device__ __forceinline__ void store_relaxed(uint4 *ptr, uint4 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)
+                             : "memory");),
+               (asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(ulonglong2 *ptr, ulonglong2 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y)
+                             : "memory");),
+               (asm volatile("st.cg.v2.u64 [%0], {%1, %2};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(ushort4 *ptr, ushort4 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w)
+                             : "memory");),
+               (asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(uint2 *ptr, uint2 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y)
+                             : "memory");),
+               (asm volatile("st.cg.v2.u32 [%0], {%1, %2};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(unsigned long long *ptr,
+                                                     unsigned long long val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.u64 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "l"(val)
+                             : "memory");),
+               (asm volatile("st.cg.u64 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "l"(val)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(unsigned int *ptr, unsigned int val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.u32 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val)
+                             : "memory");),
+               (asm volatile("st.cg.u32 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(unsigned short *ptr, unsigned short val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.u16 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"(val)
+                             : "memory");),
+               (asm volatile("st.cg.u16 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"(val)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(unsigned char *ptr, unsigned char val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("{"
+                             "  .reg .u8 datum;"
+                             "  cvt.u8.u16 datum, %1;"
+                             "  st.relaxed.gpu.u8 [%0], datum;"
+                             "}"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"((unsigned short)val)
+                             : "memory");),
+               (asm volatile("{"
+                             "  .reg .u8 datum;"
+                             "  cvt.u8.u16 datum, %1;"
+                             "  st.cg.u8 [%0], datum;"
+                             "}"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"((unsigned short)val)
+                             : "memory");));
+}
+
+__device__ __forceinline__ void store_release(uint4 *ptr, uint4 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)
+                             : "memory");),
+               (__threadfence();
+                asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)
+                             : "memory");));
+}
%2, %3, %4};" + : + : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) + : "memory");)); +} + +__device__ __forceinline__ void store_release(ulonglong2 *ptr, ulonglong2 val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" + : + : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y) + : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u64 [%0], {%1, %2};" + : + : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y) + : "memory");)); +} + +__device__ __forceinline__ void store_release(ushort4 *ptr, ushort4 val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" + : + : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) + : "memory");), + (__threadfence(); + asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" + : + : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) + : "memory");)); +} + +__device__ __forceinline__ void store_release(uint2 *ptr, uint2 val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" + : + : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y) + : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u32 [%0], {%1, %2};" + : + : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y) + : "memory");)); +} + +__device__ __forceinline__ void store_release(unsigned long long *ptr, unsigned long long val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u64 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "l"(val) + : "memory");), + (__threadfence(); asm volatile("st.cg.u64 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "l"(val) + : "memory");)); +} + +__device__ __forceinline__ void store_release(unsigned int *ptr, unsigned int val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u32 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "r"(val) + : "memory");), + (__threadfence(); asm volatile("st.cg.u32 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "r"(val) + : "memory");)); +} + +__device__ __forceinline__ void store_release(unsigned short *ptr, unsigned short val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u16 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "h"(val) + : "memory");), + (__threadfence(); asm volatile("st.cg.u16 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "h"(val) + : "memory");)); +} + +__device__ __forceinline__ void store_release(unsigned char *ptr, unsigned char val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("{" + " .reg .u8 datum;" + " cvt.u8.u16 datum, %1;" + " st.release.gpu.u8 [%0], datum;" + "}" + : + : _CUB_ASM_PTR_(ptr), "h"((unsigned short)val) + : "memory");), + (__threadfence(); asm volatile("{" + " .reg .u8 datum;" + " cvt.u8.u16 datum, %1;" + " st.cg.u8 [%0], datum;" + "}" + : + : _CUB_ASM_PTR_(ptr), "h"((unsigned short)val) + : "memory");)); +} + +} // namespace detail + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +CUB_NAMESPACE_END +
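
Note (illustration, not part of the patch): the sketch below shows how the new cub::detail primitives compose into the publish/consume protocol that the patched ScanTileState implements: the producer writes its payload with a plain store and then release-stores the status word, while the consumer polls with relaxed loads and a fence. The function and constant names here (publish_tile, wait_for_tile, TILE_INVALID, TILE_READY) are hypothetical placeholders, not identifiers from CUB.

// Illustrative CUDA sketch only; the real protocol lives in ScanTileState.
#include <cub/detail/strong_load.cuh>
#include <cub/detail/strong_store.cuh>

constexpr unsigned int TILE_INVALID = 0; // placeholder status values, not CUB's enum
constexpr unsigned int TILE_READY   = 1;

__device__ void publish_tile(unsigned int *tile_status,
                             unsigned int *tile_value,
                             int tile_idx,
                             unsigned int inclusive)
{
  // Write the payload first, then release-store the status word. On SM70+ this
  // emits st.release.gpu; on older targets store_release falls back to
  // __threadfence() + st.cg, i.e. the fence-plus-ThreadStore() pair the patch
  // removes from SetInclusive()/SetPartial().
  tile_value[tile_idx] = inclusive;
  cub::detail::store_release(tile_status + tile_idx, TILE_READY);
}

__device__ unsigned int wait_for_tile(const unsigned int *tile_status,
                                      const unsigned int *tile_value,
                                      int tile_idx)
{
  // Poll the status word with relaxed loads; the fence inside the loop keeps the
  // payload read below from being hoisted, mirroring the patched WaitForValid().
  unsigned int status;
  do
  {
    status = cub::detail::load_relaxed(tile_status + tile_idx);
    __threadfence();
  } while (status == TILE_INVALID);

  return tile_value[tile_idx];
}

Because the fallback path reproduces the old fence-then-st.cg sequence exactly, behavior on pre-Volta architectures is unchanged; only SM70+ gets the cheaper release/relaxed instructions.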
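Note (illustration, not part of the patch): the delay helpers added to single_pass_scan_operators.cuh implement a polling backoff: on Volta and newer a waiting thread can __nanosleep() instead of repeatedly fencing, and small grids skip the sleep because the predecessor tile is usually already available. A standalone sketch of that idea follows, with placeholder names and values rather than the ones the patch uses, and with a plain __CUDA_ARCH__ guard instead of the NV_IF_TARGET/NV_DISPATCH_TARGET macros the patch relies on.

// Illustrative backoff sketch for a decoupled look-back polling loop.
template <unsigned int DelayNs, unsigned int GridThreshold>
__device__ __forceinline__ void backoff_sketch()
{
#if __CUDA_ARCH__ >= 700
  if (DelayNs > 0)
  {
    if (gridDim.x < GridThreshold)
    {
      __threadfence_block(); // small grid: just prevent load hoisting
    }
    else
    {
      __nanosleep(DelayNs);  // large grid: yield briefly to reduce polling pressure
    }
  }
#else
  __threadfence_block();     // __nanosleep() is unavailable before SM 70
#endif
}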